@article {pmid39955192, year = {2025}, author = {Liu, W and Wang, Y}, title = {Acoustic Characteristics of Tenors and Sopranos in Chinese National Singing and Bel Canto.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2025.01.039}, pmid = {39955192}, issn = {1873-4588}, abstract = {BACKGROUND: With the advancement of vocal arts, Chinese National Singing and Western Classical Singing (Bel Canto) encounter challenges in cross-cultural adaptation. Investigating formant tuning strategies and the singer's formant is crucial for scientifically characterizing the vocal production techniques in Chinese singing styles.

METHOD: Eight singers-Chinese National Singing tenors, Chinese National Singing sopranos, Bel Canto tenors, and Bel Canto sopranos-were recruited. The fundamental frequency (F0), intensity, formants, and long-term average spectrum (LTAS) were analyzed using a series of designed tasks to examine the phonation and articulation characteristics of these two singing genres in the context of cross-cultural adaptation.

RESULTS: A positive correlation between F0 and intensity was generally observed, though variations existed across vowels and singers. Both linear and non-linear relationships were found between F0 and formants. The first formant (F1) was proportional to F0, with greater variability for female singers in the vowel /a/. LTAS analysis revealed that the tenors exhibited the singer's formant in sung vowels and songs, whereas the sopranos did not exhibit this feature when singing vowels but did so in specific songs. Moreover, the primary and secondary spectral peaks in Bel Canto were less influenced by songs compared to Chinese National Singing.

CONCLUSIONS: (i) Intensity can provide an objective basis for differentiating subjective differences between singing genres, and individual differences are evident in how singers handle the relationship between F0 and intensity. (ii) Vowel modification and vowel migration in sopranos reflect consistency and variability across linguistic and cultural contexts. (iii) The presence and characteristics of the singer's formant are influenced by sex, singing genre, and song. Differences in the degree of spectral influence between the two singing genres suggest that Bel Canto emphasizes yi qiang xing zi (ie, phonation drives articulation), while Chinese National Singing emphasizes yi zi xing qiang (ie, articulation drives phonation).}, } @article {pmid39924373, year = {2025}, author = {Pan, AY and Grail, GPO and Albert, G and Groll, MD and Stepp, CE and Arnocky, SA and Hodges-Simeon, CR}, title = {What Contributes to Masculine Perception of Voice Among Transmasculine People on Testosterone Therapy?}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.12.037}, pmid = {39924373}, issn = {1873-4588}, abstract = {Voice is a highly salient and complex signal that people use to categorize another's gender. For transmasculine individuals seeking to align their gender expression with their gender identity, vocal presentation is a major concern. Voice-gender incongruence, where one's voice does not match one's gender identity, can lead to vocal strain, fatigue, emotional distress, and increased risk of suicidality. Testosterone therapy, which uses exogenous testosterone to masculinize or androgynize the voice and other secondary sexual characteristics in individuals assigned female at birth, is one method to address this issue. However, many individuals remain dissatisfied with their voice post therapy, indicating that hormonal voice modification is a complex process that is not fully understood. In the present study, we used unmodified voice samples from 30 transmasculine individuals undergoing testosterone therapy and applied multivariate analysis to determine the relative and combined effects of four acoustic parameters on two measures of gender perception. The results show that transmasculine individuals' speech is perceived as being just as "masculine" as that of cisgender males, with both groups being statistically categorized as male at similar rates. Although mean fundamental frequency and formant-estimated vocal tract length together account for a significant portion of the variance in gender perceptions, a substantial amount of variance in gender perception remains unexplained. Understanding the acoustic and sociolinguistic factors that contribute to masculine voice presentation can lead to more informed and individualized care for transmasculine individuals experiencing voice-gender incongruence and considering testosterone therapy. For this population, addressing voice-gender incongruence has important implications for life satisfaction, quality of life, and self-esteem.}, } @article {pmid39889010, year = {2024}, author = {Luo, X and Lv, J and Liu, W and Mi, C and Wang, J and Yang, L and Chu, PK and Liu, C}, title = {Double-formant PCF-SPR refractive index sensor with ultra-high double-peak-shift sensitivity and a wide detection range.}, journal = {Journal of the Optical Society of America. A, Optics, image science, and vision}, volume = {41}, number = {10}, pages = {1873-1883}, doi = {10.1364/JOSAA.530505}, pmid = {39889010}, issn = {1520-8532}, abstract = {A dual-resonance-peak photonic crystal fiber-surface plasmon resonance (PCF-SPR) refractive index (RI) sensor is designed for different wavelength ranges. The first resonance peak of the sensor is distributed in the wavelength range of 700-2350 nm, while the second peak is distributed in the range of 2350-5550 nm. In addition to detecting analytes using the full spectrum of constraint losses (CLs), it is also possible to use a single resonance peak to achieve the detection of analytes. By systematically optimizing the nanowire diameter, the diameters of the inner- and outer-layer air holes, the width of the groove, the polishing depth, and the distance from the outer-layer air holes to the fiber core, the optimal structure of the sensor is finally determined. The sensor was studied by numerical analysis, and its characteristics were evaluated by wavelength detection technology. The results show that within the RI range of 1.24-1.37, the sensor has a maximum wavelength sensitivity (WS) of 54700 nm/RIU for detecting the RI of analytes. Within the above refractive index range, the regression coefficient R² of the dual-peak-resonance wavelength is 0.99993, ensuring the accuracy of the estimated resonance wavelength of the sensor. In addition, the sensor can also use dual-peak-shift sensitivity (DPSS) to detect the refractive index, which is a relatively new sensing technology. The maximum DPSS of the sensor is 95300 nm/RIU. Due to its high sensitivity and unique dual-peak characteristics, this sensor has wide application prospects in medical diagnosis, environmental monitoring, food safety, and other fields.}, } @article {pmid39824758, year = {2025}, author = {Đinh, LG and Brunelle, M and Tạ, TT}, title = {Relating production and perception in two Raglai dialects at different stages of registrogenesis.}, journal = {Phonetica}, volume = {}, number = {}, pages = {}, pmid = {39824758}, issn = {1423-0321}, abstract = {This paper explores the perception of two diachronically related and mutually intelligible phonological oppositions, the onset voicing contrast of Northern Raglai and the register contrast of Southern Raglai. It is the continuation of a previous acoustic study that revealed that Northern Raglai onset stops maintain a voicing distinction accompanied by weak formant and voice quality modulations on following vowels, while Southern Raglai has transphonologized this voicing contrast into a register contrast marked by vowel and voice quality distinctions. Our findings indicate that the two dialects partially differ in their use of identification cues, with Northern Raglai listeners using both voicing and F1 as major cues while Southern Raglai listeners largely focus on F1. Production and perception are thus not perfectly aligned in Northern Raglai, because F1 plays a stronger role in perception than production in this dialect.
We conclude that mutual intelligibility between dialects is possible because they both use F1 for identification.}, } @article {pmid39769881, year = {2024}, author = {Jv, X and Wu, J and Mao, Q and Li, Q and Zhang, T}, title = {Development on Light and Thin Broadband Sound Absorption Structure Based on Unequal-Cross-Section Microperforated Plate Series Connection.}, journal = {Materials (Basel, Switzerland)}, volume = {17}, number = {24}, pages = {}, pmid = {39769881}, issn = {1996-1944}, support = {51965041//National Natural Science Foundation of China/ ; YC2022-s735//Jiangxi Postgraduate Innovation Special Fund Project/ ; }, abstract = {The sound absorption structure of a microperforated plate has many advantages and has great potential in the field of noise control. In order to solve the problem of broadband sound absorption of microperforated plates, a series acoustic structure of microperforated plates of unequal cross-section was designed based on the traditional microperforated plate series acoustic structure. Compared with the traditional series structure, the sudden change of cross-section increases the sound energy dissipation and greatly improves the sound absorption performance. Through the analysis of its parameters, when the overall thickness of the structure is 20 mm, its sound absorption coefficient is above 0.5 in the frequency range of 1000-3450 Hz; there are three formants, and the sound absorption coefficients corresponding to the three formants reach 1. This study provides new ideas and methods for the design of broadband acoustic structures.}, } @article {pmid39763462, year = {2024}, author = {Caragli, V and Zacheo, E and Nodari, R and Genovese, E and Mancuso, A and Mazzoni, L}, title = {Effects of face protector devices on acoustic parameters of voice.}, journal = {Acta otorhinolaryngologica Italica : organo ufficiale della Societa italiana di otorinolaringologia e chirurgia cervico-facciale}, volume = {44}, number = {6}, pages = {377-391}, pmid = {39763462}, issn = {1827-675X}, mesh = {Humans ; *COVID-19/prevention & control/transmission ; Male ; Adult ; Female ; *Personal Protective Equipment ; *Voice Quality ; *Speech Acoustics ; Masks ; Young Adult ; Middle Aged ; Voice ; }, abstract = {OBJECTIVES: The SARS-CoV-2 pandemic required the use of personal protective equipment (PPE) in medical and social contexts to reduce exposure and prevent pathogen transmission. This study aims to analyse possible changes in voice and speech parameters with and without PPE.

METHODS: Speech samples using different types of PPE were obtained. Recordings were then analysed using PRAAT software (version 6.1.42). Statistical analysis was conducted using ANOVA in Jamovi software. A post-hoc test was performed to compare PPE-related results.

RESULTS: Statistically significant differences were found in Smoothed Cepstral Peak Prominence, Harmonics-to-Noise Ratio (HNR), slope of the Long-Term Average Spectrum (LTAS), tilt of the trendline through the LTAS, shimmer parameters, mean and standard deviation of vowel HNR, and vowel and consonant formants. HNR values increased whereas shimmer parameters and formant values decreased when PPE was used [PPE combined > filtering face piece (FFP) > surgical masks > no PPE].

CONCLUSIONS: Our data show improvement in many parameters of voice and speech quality and modification of speech articulation when using masks, particularly in case of combined PPE. The most relevant changes were found with a combination of face shield and FFP2 masks. This may be due to unconscious improvements in speech articulation and increased demand on vocal folds to achieve better speech intelligibility.}, } @article {pmid39738817, year = {2024}, author = {Hu, Z and Zhang, Z and Li, H and Yang, LZ}, title = {Cross-device and test-retest reliability of speech acoustic measurements derived from consumer-grade mobile recording devices.}, journal = {Behavior research methods}, volume = {57}, number = {1}, pages = {35}, pmid = {39738817}, issn = {1554-3528}, support = {82371931//Natural Science Fund of China/ ; YZJJ202207-TS//HFIPS Director's Fund/ ; 202204295107020004//Anhui Province Key Research and Development Project/ ; }, mesh = {Humans ; Reproducibility of Results ; Male ; Female ; Adult ; Young Adult ; *Speech Acoustics ; Smartphone ; Computers, Handheld ; Speech/physiology ; }, abstract = {In recent years, there has been growing interest in remote speech assessment through automated speech acoustic analysis. While the reliability of widely used features has been validated in professional recording settings, it remains unclear how the heterogeneity of consumer-grade recording devices, commonly used in nonclinical settings, impacts the reliability of these measurements. To address this issue, we systematically investigated the cross-device and test-retest reliability of classical speech acoustic measurements in a sample of healthy Chinese adults using consumer-grade equipment across three popular speech tasks: sustained phonation (SP), diadochokinesis (DDK), and picture description (PicD). A total of 51 participants completed two recording sessions spaced at least 24 hours apart. Speech outputs were recorded simultaneously using four devices: a voice recorder, laptop, tablet, and smartphone. Our results demonstrated good reliability for fundamental frequency and cepstral peak prominence in the SP task across testing sessions and devices. Other features from the SP and PicD tasks exhibited acceptable test-retest reliability, except for the period perturbation quotient from the tablet and formant frequency from the smartphone. However, measures from the DDK task showed a significant decrease in reliability on consumer-grade recording devices compared to professional devices. These findings indicate that the lower recording quality of consumer-grade equipment may compromise the reproducibility of syllable rate estimation, which is critical for DDK analysis. This study underscores the need for standardization of remote speech monitoring methodologies to ensure that remote home assessment provides accurate and reliable results for early screening.}, } @article {pmid39734777, year = {2024}, author = {Lobmaier, JS and Klatt, WK and Schweinberger, SR}, title = {Voice of a woman: influence of interaction partner characteristics on cycle dependent vocal changes in women.}, journal = {Frontiers in psychology}, volume = {15}, number = {}, pages = {1401158}, pmid = {39734777}, issn = {1664-1078}, abstract = {INTRODUCTION: Research has shown that women's vocal characteristics change during the menstrual cycle. Further, evidence suggests that individuals alter their voices depending on the context, such as when speaking to a highly attractive person, or a person with a different social status. 
The present study aimed at investigating the degree to which women's voices change depending on the vocal characteristics of the interaction partner, and how any such changes are modulated by the woman's current menstrual cycle phase.

METHODS: Forty-two naturally cycling women were recorded once during the late follicular phase (high fertility) and once during the luteal phase (low fertility) while reproducing utterances of men and women who were previously assessed to have either attractive or unattractive voices.

RESULTS: Phonetic analyses revealed that women's voices in response to speakers changed depending on their menstrual cycle phase (F0 variation, maximum F0, centre of gravity), on the stimulus speaker's vocal attractiveness (HNR, Formants 1-3, centre of gravity), and on the stimulus speaker's sex (Formant 2). The women's vocal characteristics also differed when they reproduced spoken sentences of the stimulus speakers compared to when they read out written sentences (minimum F0, Formants 2-4).

DISCUSSION: These results provide further evidence that women alter their voice depending on the vocal characteristics of the interaction partner and that these changes are modulated by the menstrual cycle phase. Specifically, the present findings suggest that cyclic shifts in women's voices may occur only in social contexts (i.e., when a putative interaction partner is involved).}, } @article {pmid39721882, year = {2024}, author = {Xiu, N and Liu, L and Li, W and Cai, Z and Wang, Y and Wang, R and Vaxelaire, B and Sock, R and Ling, Z and Chen, J}, title = {Correlation Analysis Between Cortical Structural Features and Acoustic Features in Patients With Parkinson's Disease.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.11.042}, pmid = {39721882}, issn = {1873-4588}, abstract = {PURPOSE: Parkinson's disease (PD) is a progressive neurodegenerative disease. The aim of this study was to investigate the association between acoustic and cortical brain features in Parkinson's disease patients.

METHODS: We recruited 19 Parkinson's disease patients (eight females, 11 males) and 19 healthy subjects (eight females, 11 males) to participate in the experiment. Speech samples of three vowels (/i/, /a/, /u/), six plosives (/p/, /pʰ/, /t/, /tʰ/, /k/, /kʰ/), and three voiced consonants (/l/, /m/, /n/) were collected, and the following acoustic parameters were extracted: fundamental frequency (F0), voice onset time (VOT), voicing onset-vocalic voicing onset (VO-VVO), first formant (F1), second formant (F2), third formant (F3), first bandwidth (B1), second bandwidth (B2), third bandwidth (B3), Jitter, Shimmer, and Harmonics-to-noise ratio (HNR). We also used an Ingenia CX 3.0 T scanner to complete cranial magnetic resonance scanning and performed image processing based on the Desikan-Killiany-Tourville Atlas. We assessed the differences in acoustic and neuroimaging parameters between the PD and healthy control (HC) groups using Levene's test (LT), the two-sample independent t test (TT), and the Mann-Whitney U test (MWUT), and calculated Spearman's bias correlations for acoustic and neuroimaging parameters in the PD and HC groups, respectively.

RESULTS: For the acoustic features, the TT showed that F3 of the vowel /i/ was significantly smaller in the PD group than in the HC group, and that jitter on the vowel /u/ was significantly higher in the male PD group than in the male HC group. For other acoustic measures, there were no statistically significant differences between the two groups. For the cortical features, cortical thickness, area, and volume were reduced in the vast majority of the brains of the PD patients; however, a small portion of the cortex appeared to be thickened. In the correlation analysis between cortical and acoustic features, the acoustic parameters F0, F1, F2, F3, B2, B3, VO-VVO, Jitter, HNR, and VOT showed significant and strong correlations with the thickness, area, and volume of cortical sites such as the frontal, temporal, entorhinal, fusiform, and precuneus regions in PD patients, whereas no significant correlations were found in the HC group.

CONCLUSIONS: These findings suggest that Parkinson's disease affects both the acoustic features of patients' speech and the cortical features of their brains, and that the two sets of features are correlated.}, } @article {pmid39720068, year = {2024}, author = {Song, J and Kim, H and Lee, YO}, title = {Laryngeal disease classification using voice data: Octave-band vs. mel-frequency filters.}, journal = {Heliyon}, volume = {10}, number = {24}, pages = {e40748}, pmid = {39720068}, issn = {2405-8440}, abstract = {INTRODUCTION: Laryngeal cancer diagnosis relies on specialist examinations, but non-invasive methods using voice data are emerging with artificial intelligence (AI) advancements. Mel Frequency Cepstral Coefficients (MFCCs) are widely used for voice analysis, but Octave Frequency Spectrum Energy (OFSE) may offer better accuracy in detecting subtle voice changes.

PROBLEM STATEMENT: Accurate early diagnosis of laryngeal cancer through voice data is challenging with current methods like MFCC.

OBJECTIVES: This study compares the effectiveness of MFCC and OFSE in classifying voice data into healthy, laryngeal cancer, benign mucosal disease, and vocal fold paralysis categories.

METHODS: Voice samples from 363 patients were analyzed using CNN models, employing MFCC and OFSE with 1/3 octave band filters. Gradient-weighted Class Activation Mapping (Grad-CAM) was used to visualize key voice features.

RESULTS: OFSE with 1/3 octave band filters outperformed MFCC in classification accuracy, especially in multi-class classification including laryngeal cancer, benign mucosal disease, and vocal fold paralysis groups (0.9398 ± 0.0232 vs. 0.7061 ± 0.0561). Grad-CAM analysis revealed that OFSE with 1/3 octave band filters effectively distinguished laryngeal cancer from healthy voices by focusing on increased noise in the over-formant area and changes in the fundamental frequency. The analysis also highlighted that specific narrow frequency areas, particularly in vocal fold paralysis, were critical for classification, and benign mucosal diseases occasionally resembled healthy voices, making AI differentiation between benign conditions and laryngeal cancer a significant challenge.

CONCLUSION: OFSE with 1/3 octave band filters provides superior accuracy in diagnosing laryngeal diseases including laryngeal cancer, showing potential for non-invasive, AI-driven early detection.}, } @article {pmid39656685, year = {2024}, author = {Cavalcanti, JC and Eriksson, A and Barbosa, PA and Madureira, S}, title = {Revisiting the speaker discriminatory power of vowel formant frequencies under a likelihood ratio-based paradigm: The case of mismatched speaking styles.}, journal = {PloS one}, volume = {19}, number = {12}, pages = {e0311363}, pmid = {39656685}, issn = {1932-6203}, mesh = {Humans ; Male ; Adult ; *Speech/physiology ; Speech Acoustics ; Phonetics ; Likelihood Functions ; Young Adult ; Speech Production Measurement/methods ; Language ; }, abstract = {Differentiating subjects through the comparison of their recorded speech is a common endeavor in speaker characterization. When using an acoustic-based approach, this task typically involves scrutinizing specific acoustic parameters and assessing their discriminatory capacity. This experimental study aimed to evaluate the speaker discriminatory power of vowel formants-resonance peaks in the vocal tract-in two different speaking styles: Dialogue and Interview. Different testing procedures were applied, specifically metrics compatible with the likelihood ratio paradigm. Only high-quality recordings were analyzed in this study. The participants were 20 male Brazilian Portuguese (BP) speakers from the same dialectal area. Two speaker-discriminatory power estimates were examined through Multivariate Kernel Density analysis: log-likelihood-ratio costs (Cllr) and equal error rates (EER). As expected, the discriminatory performance was stronger for style-matched analyses than for mismatched-style analyses. In order of relevance, F3, F4, and F1 performed the best in style-matched comparisons, as suggested by lower Cllr and EER values. F2 performed the worst intra-style in both Dialogue and Interview. The discriminatory power of all individual formants (F1-F4) appeared to be affected in the mismatched condition, demonstrating that discriminatory power is sensitive to style-driven changes in speech production. The combination of higher formants 'F3 + F4' outperformed the combination of lower formants 'F1 + F2'. However, in mismatched-style analyses, the magnitude of improvement in Cllr and EER scores increased as more formants were incorporated into the model. The best discriminatory performance was achieved when most formants were combined. Applying multivariate analysis not only reduced average Cllr and EER scores but also influenced the overall probability distribution, shifting the probability density distribution towards lower Cllr and EER values. In general, front and central vowels were found to be more speaker-discriminatory than back vowels as far as the 'F1 + F2' relation was concerned.}, } @article {pmid39656649, year = {2024}, author = {Cervantes Constantino, F and Caputi, Á}, title = {Cortical tracking of speakers' spectral changes predicts selective listening.}, journal = {Cerebral cortex (New York, N.Y. : 1991)}, volume = {34}, number = {12}, pages = {}, doi = {10.1093/cercor/bhae472}, pmid = {39656649}, issn = {1460-2199}, support = {FCE_1_2019_1_155889//Agencia Nacional de Investigación e Innovación/ ; }, mesh = {Humans ; Male ; Female ; *Speech Perception/physiology ; Adult ; *Electroencephalography/methods ; Young Adult ; Cerebral Cortex/physiology ; Acoustic Stimulation/methods ; }, abstract = {A social scene is particularly informative when people are distinguishable. To understand somebody amid "cocktail party" chatter, we automatically index their voice. This ability is underpinned by parallel processing of vocal spectral contours from speech sounds, but it has not yet been established how this occurs in the brain's cortex. We investigate single-trial neural tracking of slow frequency modulations in speech using electroencephalography. Participants briefly listened to unfamiliar single speakers, and in addition, they performed a cocktail party comprehension task. Quantified through stimulus reconstruction methods, robust tracking was found in neural responses to slow (delta-theta range) modulations of frequency contours in the fourth and fifth formant band, equivalent to the 3.5-5 kHz audible range. The spectral spacing between neighboring instantaneous frequency contours (ΔF), which also yields indexical information from the vocal tract, was similarly decodable. Moreover, EEG evidence of listeners' spectral tracking abilities predicted their chances of succeeding at selective listening when faced with two-speaker speech mixtures. In summary, the results indicate that the communicating brain can rely on locking of cortical rhythms to major changes led by upper resonances of the vocal tract. Their corresponding articulatory mechanics hence continuously issue a fundamental credential for listeners to target in real time.}, } @article {pmid39665279, year = {2024}, author = {Heiszenberger, E and Reinisch, E and Hartmann, F and Brown, E and Pustka, E}, title = {Perceptually Easy Second-Language Phones Are Not Always Easy: The Role of Orthography and Phonology in Schwa Realization in Second-Language French.}, journal = {Language and speech}, volume = {}, number = {}, pages = {238309241277995}, doi = {10.1177/00238309241277995}, pmid = {39665279}, issn = {1756-6053}, abstract = {Encoding and establishing a new second-language (L2) phonological category is notoriously difficult. This is particularly true for phonological contrasts that do not exist in the learners' native language (L1). Phonological categories that also exist in the L1 do not seem to pose any problems. However, foreign-language learners are not only presented with oral input. Instructed L2 learning often involves heavy reliance on written forms of the target language. The present study investigates the contribution of orthography to the quality of phonolexical encoding by examining the acoustics of French schwa produced by Austrian German learners-a perceptually and articulatorily easy L2 phone with incongruent grapheme-phoneme correspondences between the L1 and L2. We compared production patterns in an auditory word-repetition task (without orthographic input) with those in a word-reading task. We analyzed the formant values (F1, F2, F3) of the schwa realizations of two groups of Austrian high-school students who had been learning French for 1 and 6 years. The results show that production patterns are more likely to be affected by L1 grapheme-to-phoneme correspondences when orthographic input is present.
However, orthography does not appear to play the dominant role, as L2 development patterns are strongly determined by both the speaker and especially the lexical item, suggesting a highly complex interaction of multiple internal and external factors in the establishment of L2 phonological categories beyond orthography and phonology.}, } @article {pmid39643915, year = {2024}, author = {Fadeev, KA and Romero Reyes, IV and Goiaeva, DE and Obukhova, TS and Ovsiannikova, TM and Prokofyev, AO and Rytikova, AM and Novikov, AY and Kozunov, VV and Stroganova, TA and Orekhova, EV}, title = {Attenuated processing of vowels in the left temporal cortex predicts speech-in-noise perception deficit in children with autism.}, journal = {Journal of neurodevelopmental disorders}, volume = {16}, number = {1}, pages = {67}, pmid = {39643915}, issn = {1866-1955}, mesh = {Humans ; Male ; *Speech Perception/physiology ; *Magnetoencephalography ; Child ; *Temporal Lobe/physiopathology ; *Noise ; Acoustic Stimulation ; Evoked Potentials, Auditory/physiology ; Autism Spectrum Disorder/physiopathology/complications ; Adolescent ; Auditory Cortex/physiopathology ; Autistic Disorder/physiopathology/complications ; }, abstract = {BACKGROUND: Difficulties with speech-in-noise perception in autism spectrum disorders (ASD) may be associated with impaired analysis of speech sounds, such as vowels, which represent the fundamental phoneme constituents of human speech. Vowels elicit early (< 100 ms) sustained processing negativity (SPN) in the auditory cortex that reflects the detection of an acoustic pattern based on the presence of formant structure and/or periodic envelope information (f0) and its transformation into an auditory "object".

METHODS: We used magnetoencephalography (MEG) and individual brain models to investigate whether SPN is altered in children with ASD and whether this deficit is associated with impairment in their ability to perceive speech in the background of noise. MEG was recorded while boys with ASD and typically developing boys passively listened to sounds that differed in the presence/absence of f0 periodicity and formant structure. Word-in-noise perception was assessed in a separate psychoacoustic experiment using stationary and amplitude-modulated noise with varying signal-to-noise ratios.

RESULTS: SPN was present in both groups with similarly early onset. In children with ASD, the SPN associated with processing formant structure was reduced predominantly in the cortical areas lateral and medial to the primary auditory cortex, starting at ~150-200 ms after stimulus onset. In the left hemisphere, this deficit correlated with the impaired ability of children with ASD to recognize words in amplitude-modulated noise, but not in stationary noise.

CONCLUSIONS: These results suggest that perceptual grouping of vowel formants into phonemes is impaired in children with ASD and that, in the left hemisphere, this deficit contributes to their difficulties with speech perception in fluctuating background noise.}, } @article {pmid39605265, year = {2024}, author = {Xie, B and Li, Z and Wang, H and Kuang, X and Ni, W and Zhong, R and Li, Y}, title = {[The influence of vowel and sound intensity on the results of voice acoustic formant detection was analyzed].}, journal = {Lin chuang er bi yan hou tou jing wai ke za zhi = Journal of clinical otorhinolaryngology head and neck surgery}, volume = {38}, number = {12}, pages = {1149-1153}, doi = {10.13201/j.issn.2096-7993.2024.12.011}, pmid = {39605265}, issn = {2096-7993}, mesh = {Humans ; Male ; Female ; Young Adult ; *Speech Acoustics ; Voice Quality ; Phonetics ; Voice/physiology ; Adult ; }, abstract = {Objective: This study aims to explore the influence of vowels and sound intensity on formants, so as to provide a reference for the selection of sound samples and vocalization methods in acoustic detection. Methods: Thirty-eight healthy subjects (19 male, 19 female), aged 19-24 years, were recruited. The formants of different vowels (/a/, /(?)/, /i/ and /u/) and different sound intensities (lowest sound, comfort sound, highest true sound and highest falsetto sound) were analyzed, and pairwise comparisons were made between groups with significant differences. Results: ①In the first formant, the vowels /a/ and /(?)/ were larger than /i/ and /u/; in the second formant, /i/ was the largest. The minimum value of the first formant occurred at the lowest sound of /i/ and the maximum at the highest sound of /a/. ②The first formant increased with sound intensity in the chest voice range, while the second formant decreased significantly on entering the highest falsetto. Conclusion: Different vowels and sound intensities yield different formant distributions; that is, vowel and sound intensity influence the formants to different degrees. The extreme values of the first formant allow a preliminary determination of the maximum normal range, which is helpful for improving acoustic detection.}, } @article {pmid39589237, year = {2025}, author = {Fagniart, S and Delvaux, V and Harmegnies, B and Huberlant, A and Huet, K and Piccaluga, M and Watterman, I and Charlier, B}, title = {Producing Nasal Vowels Without Nasalization? Perceptual Judgments and Acoustic Measurements of Nasal/Oral Vowels Produced by Children With Cochlear Implants and Typically Hearing Peers.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {68}, number = {1}, pages = {301-322}, doi = {10.1044/2024_JSLHR-24-00083}, pmid = {39589237}, issn = {1558-9102}, mesh = {Humans ; *Cochlear Implants ; Female ; Male ; Child ; *Speech Acoustics ; *Phonetics ; *Cues ; *Speech Perception/physiology ; *Judgment ; Speech Production Measurement/methods ; Speech/physiology ; Nose/physiology ; Deafness/rehabilitation ; }, abstract = {PURPOSE: The objective of the present study is to investigate nasal and oral vowel production in French-speaking children with cochlear implants (CIs) and children with typical hearing (TH). Vowel nasality relies primarily on acoustic cues that may be less effectively transmitted by the implant. The study investigates how children with CIs manage to produce these segments in French, a language with contrastive vowel nasalization.

METHOD: The children performed a task in which they repeated sentences containing a consonant-vowel-consonant-vowel-type pseudoword, the vowel being a nasal or oral vowel from French. Thirteen children with CIs and 25 children with TH completed the task. Among the children with CIs, the level of exposure to Cued Speech (CS) was either occasional (CS-) or intense (CS+). The productions were analyzed through perceptual judgments and acoustic measurements. Different acoustic cues related to nasality were collected: segmental durations, formant values, and predicted values of nasalization. Multiple regression analyses were conducted to examine which acoustic features are associated with perceived nasality in perceptual judgments.

RESULTS: The perceptual judgments performed on the children's speech productions indicate that children with sustained exposure to CS (CS+) exhibited the best-identified and most distinct oral/nasal productions. Acoustic measures revealed different production profiles among the groups: children in the CS+ group seem to differentiate between nasal and oral vowels by relying on segmental duration cues and variations in oropharyngeal configurations (associated with formant differences), but less through nasal resonance.

CONCLUSION: The study highlights (a) a benefit of sustained CS practice for CI children for the intelligibility of nasal-oral segments, (b) privileged exploitation of temporal (segmental duration) and salient acoustic cues (oropharyngeal configuration) in the CS+ group, and (c) difficulties among children with CI in distinguishing nasal-oral segments through nasal resonance.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.27744768.}, } @article {pmid39550323, year = {2024}, author = {Bøyesen, B and Hide, Ø}, title = {Using Twang and Medialization Techniques to Gain Feminine-Sounding Speech in Trans Women.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.10.020}, pmid = {39550323}, issn = {1873-4588}, abstract = {OBJECTIVES: In this study, we introduce an intervention based on two techniques: twang and medialization. The hypothesis is that a combination of these two techniques will enable trans women to gain feminine-sounding speech without vocal strain or harm.

METHOD: Five trans women took part in the study. A control group of five cisgender women and five cisgender men was included. A list of 14 monosyllabic words was created, in which the vowel /ɑ/ was embedded in various consonant contexts. All participants were asked to read the word list three times, each time presented in a different order. The trans women read the word list before and after intervention. Acoustic analyses of fundamental frequency and the first, second, and third formant frequencies were conducted. For the perceptual analysis, 60 voice samples were selected from the entire material. Fifteen listeners were asked whether they perceived the voice samples as feminine or masculine, or whether they were uncertain. The listeners were also asked for gender judgments based on sentences read by the trans women after intervention.

RESULTS: The acoustic analyses revealed an increase in fundamental frequencies and first, second, and third formants after intervention for all five trans women, approaching the values of the female controls. The perceptual judgments showed that the majority of the trans women voice samples were perceived as feminine after intervention.

CONCLUSIONS: Based on the acoustic analyses and the perceptual evaluations, the combination of the twang and medialization techniques appears to enable the trans women to obtain feminine attribution. Nevertheless, the study is too small for generalizations. A take-home message, however, is that it is appropriate to focus primarily on resonance, in addition to speaking fundamental frequency, to gain feminine-sounding speech.}, } @article {pmid39531311, year = {2024}, author = {Ponsonnet, M and Coupé, C and Pellegrino, F and Garcia Arasco, A and Pisanski, K}, title = {Vowel signatures in emotional interjections and nonlinguistic vocalizations expressing pain, disgust, and joy across languages.}, journal = {The Journal of the Acoustical Society of America}, volume = {156}, number = {5}, pages = {3118-3139}, doi = {10.1121/10.0032454}, pmid = {39531311}, issn = {1520-8524}, mesh = {Humans ; *Emotions ; Phonetics ; Language ; Speech Acoustics ; Pain/psychology ; Voice Quality ; Happiness ; }, abstract = {In this comparative cross-linguistic study we test whether expressive interjections (words like ouch or yay) share similar vowel signatures across the world's languages, and whether these can be traced back to nonlinguistic vocalizations (like screams and cries) expressing the same emotions of pain, disgust, and joy. We analyze vowels in interjections from dictionaries of 131 languages (over 600 tokens) and compare these with nearly 500 vowels based on formant frequency measures from voice recordings of volitional nonlinguistic vocalizations. We show that across the globe, pain interjections feature a-like vowels and wide falling diphthongs ("ai" as in Ayyy! "aw" as in Ouch!), whereas disgust and joy interjections do not show robust vowel regularities that extend geographically. In nonlinguistic vocalizations, all emotions yield distinct vowel signatures: pain prompts open vowels such as [a], disgust schwa-like central vowels, and joy front vowels such as [i]. Our results show that pain is the only affective experience tested with a clear, robust vowel signature that is preserved between nonlinguistic vocalizations and interjections across languages. These results offer empirical evidence for iconicity in some expressive interjections. We consider potential mechanisms and origins, from evolutionary pressures and sound symbolism to colexification, proposing testable hypotheses for future research.}, } @article {pmid39516258, year = {2024}, author = {Carranante, G and Cany, C and Farri, P and Giavazzi, M and Varnet, L}, title = {Mapping the spectrotemporal regions influencing perception of French stop consonants in noise.}, journal = {Scientific reports}, volume = {14}, number = {1}, pages = {27183}, pmid = {39516258}, issn = {2045-2322}, support = {ANR-20-CE28-0004//Agence Nationale de la Recherche/ ; ANR-17-EURE-0017//Agence Nationale de la Recherche/ ; }, mesh = {Humans ; *Speech Perception/physiology ; Female ; Male ; *Noise ; Adult ; *Phonetics ; Young Adult ; Language ; Cues ; Speech Acoustics ; France ; Acoustic Stimulation ; }, abstract = {Understanding how speech sounds are decoded into linguistic units has been a central research challenge over the last century. This study follows a reverse-correlation approach to reveal the acoustic cues listeners use to categorize French stop consonants in noise.
Compared to previous methods, this approach ensures an unprecedented level of detail with only minimal theoretical assumptions. Thirty-two participants performed a speech-in-noise discrimination task based on natural /aCa/ utterances, with C = /b/, /d/, /g/, /p/, /t/, or /k/. The trial-by-trial analysis of their confusions enabled us to map the spectrotemporal information they relied on for their decisions. In place-of-articulation contrasts, the results confirmed the critical role of formant consonant-vowel transitions, used by all participants, and, to a lesser extent, vowel-consonant transitions and high-frequency release bursts. Similarly, for voicing contrasts, we validated the prominent role of the voicing bar cue, with some participants also using formant transitions and burst cues. This approach revealed that most listeners use a combination of several cues for each task, with significant variability within the participant group. These insights shed new light on decades-old debates regarding the relative importance of cues for phoneme perception and suggest that research on acoustic cues should not overlook individual variability in speech perception.}, } @article {pmid39515817, year = {2024}, author = {Lin, YC and Yan, HT and Lin, CH and Chang, HH}, title = {Identifying and Estimating Frailty Phenotypes by Vocal Biomarkers: Cross-Sectional Study.}, journal = {Journal of medical Internet research}, volume = {26}, number = {}, pages = {e58466}, pmid = {39515817}, issn = {1438-8871}, mesh = {Humans ; Aged ; Cross-Sectional Studies ; *Frailty/physiopathology ; Male ; Female ; *Phenotype ; *Biomarkers ; Middle Aged ; Voice/physiology ; Aged, 80 and over ; Taiwan ; Frail Elderly/statistics & numerical data ; Sarcopenia/physiopathology/diagnosis ; }, abstract = {BACKGROUND: Researchers have developed a variety of indices to assess frailty. Recent research indicates that the human voice reflects frailty status. Frailty phenotypes are seldom discussed in the literature on the aging voice.

OBJECTIVE: This study aims to examine potential phenotypes of frail older adults and determine their correlation with vocal biomarkers.

METHODS: Participants aged ≥60 years who visited the geriatric outpatient clinic of a teaching hospital in central Taiwan between 2020 and 2021 were recruited. We identified 4 frailty phenotypes: energy-based frailty, sarcopenia-based frailty, hybrid-based frailty-energy, and hybrid-based frailty-sarcopenia. Participants were asked to pronounce a sustained vowel "/a/" for approximately 1 second. The speech signals were digitized and analyzed. Four voice parameters-the average number of zero crossings (A1), variations in local peaks and valleys (A2), variations in first and second formant frequencies (A3), and spectral energy ratio (A4)-were used for analyzing changes in voice. Logistic regression was used to elucidate the prediction model.

RESULTS: Among 277 older adults, an increase in A1 values was associated with a lower likelihood of energy-based frailty (odds ratio [OR] 0.81, 95% CI 0.68-0.96), whereas an increase in A2 values resulted in a higher likelihood of sarcopenia-based frailty (OR 1.34, 95% CI 1.18-1.52). Respondents with larger A3 and A4 values had a higher likelihood of hybrid-based frailty-sarcopenia (OR 1.03, 95% CI 1.002-1.06) and hybrid-based frailty-energy (OR 1.43, 95% CI 1.02-2.01), respectively.

CONCLUSIONS: Vocal biomarkers might be potentially useful in estimating frailty phenotypes. Clinicians can use 2 crucial acoustic parameters, namely A1 and A2, to diagnose a frailty phenotype that is associated with insufficient energy or reduced muscle function. The assessment of A3 and A4 involves a complex frailty phenotype.}, } @article {pmid39487102, year = {2025}, author = {Hullebus, M and Gafos, A and Boll-Avetisyan, N and Langus, A and Fritzsche, T and Höhle, B}, title = {Infant preference for specific phonetic cue relations in the contrast between voiced and voiceless stops.}, journal = {Infancy : the official journal of the International Society on Infant Studies}, volume = {30}, number = {1}, pages = {e12630}, pmid = {39487102}, issn = {1532-7078}, support = {317633480 - SFB 1287//Deutsche Forschungsgemeinschaft/ ; }, mesh = {Humans ; *Cues ; *Speech Perception ; *Phonetics ; Male ; Female ; Infant ; Speech Acoustics ; Adult ; Acoustic Stimulation ; Language Development ; }, abstract = {Acoustic variability in the speech input has been shown, in certain contexts, to be beneficial during infants' acquisition of sound contrasts. One approach attributes this result to the potential of variability to make the stability of individual cues visible. Another approach suggests that, instead of highlighting individual cues, variability uncovers stable relations between cues that signal a sound contrast. Here, we investigate the relation between Voice Onset Time and the onset of F1 formant frequency, two cues that subserve the voicing contrast in German. First, we verified that German-speaking adults' use of VOT to categorize voiced and voiceless stops is dependent on the value of the F1 onset frequency, in the specific form of a so-called trading relation. Next, we tested whether 6-month-old German learning infants exhibit differential sensitivity to stimulus continua in which the cues varied to an equal extent, but either adhered to the trading relation established in the adult experiment or adhered to a reversed relation. Our results present evidence that infants prefer listening to speech in which phonetic cues conform to certain cue trading relations over cue relations that are reversed.}, } @article {pmid39473806, year = {2024}, author = {Ayadi, H and Elbéji, A and Despotovic, V and Fagherazzi, G}, title = {Digital Vocal Biomarker of Smoking Status Using Ecological Audio Recordings: Results from the Colive Voice Study.}, journal = {Digital biomarkers}, volume = {8}, number = {1}, pages = {159-170}, pmid = {39473806}, issn = {2504-110X}, abstract = {INTRODUCTION: The complex health, social, and economic consequences of tobacco smoking underscore the importance of incorporating reliable and scalable data collection on smoking status and habits into research across various disciplines. Given that smoking impacts voice production, we aimed to develop a gender and language-specific vocal biomarker of smoking status.

METHODS: Leveraging data from the Colive Voice study, we used statistical analysis methods to quantify the effects of smoking on voice characteristics. Various voice feature extraction methods combined with machine learning algorithms were then used to produce a gender and language-specific (English and French) digital vocal biomarker to differentiate smokers from never-smokers.

RESULTS: A total of 1,332 participants were included after propensity score matching (mean age = 43.6 [13.65]; 64.41% female; 56.68% English speakers; 50% smokers and 50% never-smokers). We observed differences in the distribution of voice features: for women, the fundamental frequency F0, the frequencies of formants F1, F2, and F3, and the harmonics-to-noise ratio were lower in smokers compared to never-smokers (p < 0.05), while for men no significant disparities were noted between the two groups. The accuracy and AUC of smoking status prediction reached 0.71 and 0.76, respectively, for the female participants, and 0.65 and 0.68, respectively, for the male participants.

CONCLUSION: We have shown that voice features are impacted by smoking. We have developed a novel digital vocal biomarker that can be used in clinical and epidemiological research to assess smoking status in a rapid, scalable, and accurate manner using ecological audio recordings.}, } @article {pmid39461704, year = {2024}, author = {Li, JJ and Daliri, A and Kim, KS and Max, L}, title = {Does pre-speech auditory modulation reflect processes related to feedback monitoring or speech movement planning?.}, journal = {Neuroscience letters}, volume = {843}, number = {}, pages = {138025}, pmid = {39461704}, issn = {1872-7972}, support = {R01 DC014510/DC/NIDCD NIH HHS/United States ; R01 DC017444/DC/NIDCD NIH HHS/United States ; R01 DC020162/DC/NIDCD NIH HHS/United States ; R01 DC020707/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Male ; Female ; *Speech/physiology ; Adult ; Young Adult ; *Electroencephalography/methods ; *Speech Perception/physiology ; Auditory Cortex/physiology ; Acoustic Stimulation/methods ; Movement/physiology ; Auditory Perception/physiology ; }, abstract = {Previous studies have revealed that auditory processing is modulated during the planning phase immediately prior to speech onset. To date, the functional relevance of this pre-speech auditory modulation (PSAM) remains unknown. Here, we investigated whether PSAM reflects neuronal processes that are associated with preparing auditory cortex for optimized feedback monitoring as reflected in online speech corrections. Combining electroencephalographic PSAM data from a previous data set with new acoustic measures of the same participants' speech, we asked whether individual speakers' extent of PSAM is correlated with the implementation of within-vowel articulatory adjustments during /b/-vowel-/d/ word productions. Online articulatory adjustments were quantified as the extent of change in inter-trial formant variability from vowel onset to vowel midpoint (a phenomenon known as centering). This approach allowed us to also consider inter-trial variability in formant production, and its possible relation to PSAM, at vowel onset and midpoint separately. Results showed that inter-trial formant variability was significantly smaller at vowel midpoint than at vowel onset. PSAM was not significantly correlated with this amount of change in variability as an index of within-vowel adjustments. Surprisingly, PSAM was negatively correlated with inter-trial formant variability not only in the middle but also at the very onset of the vowels. Thus, speakers with more PSAM produced formants that were already less variable at vowel onset. 
Findings suggest that PSAM may reflect processes that influence speech acoustics as early as vowel onset and, thus, that are directly involved in motor command preparation (feedforward control) rather than output monitoring (feedback control).}, } @article {pmid39448279, year = {2024}, author = {Pekdemir, A and Kemaloğlu, YK and Gölaç, H and İriz, A and Köktürk, O and Mengü, G}, title = {The Self-Assessment, Perturbation, and Resonance Values of Voice and Speech in Individuals with Snoring and Obstructive Sleep Apnea.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.09.018}, pmid = {39448279}, issn = {1873-4588}, abstract = {PURPOSE: The static and dynamic soft tissue changes resulting in hypopnea and/or apnea in the subjects with obstructive sleep apnea (OSA) occur in the upper airway, which also serves as the voice or speech tract. In this study, we looked for the Voice Handicap Index-10 (VHI-10) and Voice-Related Quality of Life (V-RQOL) scores in addition to perturbation and formant values of the vowels in those with snoring and OSA.

METHODS: Epworth Sleepiness Scale (ESS) and STOP-Bang scores, Body-Mass Index (BMI), neck circumference (NC), modified Mallampati Index, tonsil size, Apnea-Hypopnea Index, VHI-10 and V-RQOL scores, and the perturbation values, formant values, and fundamental frequency of the voice samples were collected for evaluation.

RESULTS: The data revealed that the VHI-10 and V-RQOL scores, but not the perturbation and formant values, were significantly different between the control and OSA subjects, and that both scores were significantly correlated with ESS and NC. Further, a few significant correlations of BMI and tonsil size with the formant and perturbation values were also found.

CONCLUSIONS: Our data reveal that (i) VHI-10 and V-RQOL were good identifiers for those with OSA, and (ii) perturbation and formant values were related particularly to tonsil size and, further, to BMI. Hence, we could say that in an attempt to use a voice parameter to screen OSA, VHI-10 and V-RQOL appeared to be better than the objective voice measures, which could be variable due to the tonsil size and BMI of the subjects.}, } @article {pmid39445770, year = {2024}, author = {Feng, S and Jiang, X}, title = {Acoustic encoding of vocally expressed confidence and doubt in Chinese bidialectics.}, journal = {The Journal of the Acoustical Society of America}, volume = {156}, number = {4}, pages = {2860-2876}, doi = {10.1121/10.0032400}, pmid = {39445770}, issn = {1520-8524}, mesh = {Adult ; Female ; Humans ; Male ; Intention ; *Language ; Multilingualism ; Phonetics ; *Speech Acoustics ; *Speech Perception ; }, abstract = {Language communicators use acoustic-phonetic cues to convey a variety of social information in the spoken language, and the learning of a second language affects speech production in a social setting. It remains unclear how speaking different dialects could affect the acoustic metrics underlying the intended communicative meanings. Nine Chinese Bayannur-Mandarin bidialectics produced single-digit numbers in statements of both Standard Mandarin and the Bayannur dialect with different levels of intended confidence. Fifteen listeners judged the intention presence and confidence level. Prosodically unmarked and marked stimuli exhibited significant differences in perceived intention. A higher intended level was perceived as more confident. The acoustic analysis revealed that the segmental (third and fourth formants, center of gravity), suprasegmental (mean fundamental frequency, fundamental frequency range, duration), and source features (harmonics-to-noise ratio, cepstral peak prominence) can distinguish between confident and doubtful expressions. Most features also distinguished between dialect and Mandarin productions. Interactions on the fourth formant and mean fundamental frequency suggested that speakers made greater use of acoustic parameters to encode confidence and doubt in the Bayannur dialect than in Mandarin. In machine learning experiments, the above-chance-level overall classification rates for confidence and doubt and the in-group advantage supported the dialect theory.}, } @article {pmid39443329, year = {2024}, author = {Persson, A}, title = {The acoustic characteristics of Swedish vowels.}, journal = {Phonetica}, volume = {81}, number = {6}, pages = {599-643}, pmid = {39443329}, issn = {1423-0321}, mesh = {Humans ; *Speech Acoustics ; *Phonetics ; Sweden ; *Language ; Speech Perception ; Sound Spectrography ; Female ; Male ; Cues ; Adult ; }, abstract = {The Swedish vowel space is relatively densely populated with 21 categories that differ in quality and quantity. Existing descriptions of the entire space rest on recordings made in the late 1990s or earlier, while recent work in general has focused on subsets of the space. The present paper reports on static and dynamic acoustic analyses of the entire vowel space using a recently released database of h-VOWEL-d words (SwehVd). The results highlight the importance of static and dynamic spectral and temporal cues for Swedish vowel category distinction.
The first two formants and vowel duration are the primary acoustic cues to vowel identity; however, the third formant contributes to increased category separability for neighboring contrasts presumed to differ in lip-rounding. In addition, even though all long-short vowel pairs differ systematically in duration, they also display considerable spectral differences, suggesting that quantity distinctions are not separate from quality distinctions in Swedish. The dynamic analysis further suggests formant movements in both long and short vowels, with [e:] and [o:] displaying clearer patterns of diphthongization.}, } @article {pmid39438167, year = {2024}, author = {Martínez-Olalla, R and Hidalgo-De la Guía, I and Gayarzábal-Heinze, E and Fernández-Ruiz, R and Núñez-Vidal, E and Álvarez-Marquina, A and Palacios-Alonso, D}, title = {Analysis of Voice Quality in Children With Smith-Magenis Syndrome.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.09.026}, pmid = {39438167}, issn = {1873-4588}, abstract = {UNLABELLED: The production of phonation involves very complex processes, linked to the physical, clinical, and emotional state of the speaker. Thus, in populations with neurological diseases, it is possible to find in the voice signal the imprint left by the deterioration of certain cortical areas or of parts of the neurocognitive mechanisms involved in speech. In previous works, the authors determined the relationship between the pathological voice characteristics of speakers with Smith-Magenis syndrome (SMS) and lower values of the cepstral peak prominence (CPP) with respect to normative speakers. They also described the presence of subharmonics in their voices.

OBJECTIVES: The present study aims to verify whether both characteristics can be used simultaneously to differentiate SMS voices from neurotypical voices. It also analyzes whether the formant trajectories vary in synchrony with the subharmonics.

METHODS: To do this, the effect of subharmonics in the voices of 12 SMS individuals was isolated to determine whether they were responsible for the lower CPP values. In regions where subharmonics were present, the CPP was also evaluated from the peak that reflected the value of f0, rather than from the most prominent peak. This provided a baseline for the CPP value in the presence of subharmonics. It was then checked whether changes in the formants occurred synchronously with the appearance of those subharmonics; if so, the muscles that control the position of the jaw and tongue would be affected at the same time as the larynx. The latter was difficult to observe since the samples were very short. Phonatory performance on a sustained /a/ was compared between a normotypical group and a non-normotypical group of children, balanced and matched in age and gender. The Spanish Association of Smith-Magenis Syndrome (ASME) provided access to almost 20% of the SMS population in Spain.
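For readers implementing this kind of analysis, the CPP measure evaluated above can be sketched in a few lines of Python. This is a minimal illustration of one common CPP variant (the cepstral peak above a regression baseline), assuming a mono frame x at sampling rate sr and a 60-330 Hz f0 search range; it is not the authors' implementation.

import numpy as np

def cpp(x, sr, fmin=60.0, fmax=330.0):
    """Cepstral peak prominence of one voiced analysis frame (dB-scale units)."""
    x = x * np.hanning(len(x))                              # taper the frame
    log_mag = 20 * np.log10(np.abs(np.fft.fft(x)) + 1e-12)  # log-magnitude spectrum (dB)
    cep = np.abs(np.fft.ifft(log_mag))                      # real cepstrum
    lo, hi = int(sr / fmax), int(sr / fmin)                 # quefrency bins for the f0 range
    peak = lo + int(np.argmax(cep[lo:hi]))                  # cepstral peak in that range
    q = np.arange(len(cep)) / sr                            # quefrency axis (seconds)
    slope, intercept = np.polyfit(q[lo:hi], cep[lo:hi], 1)  # regression baseline
    return cep[peak] - (slope * q[peak] + intercept)

# e.g., cpp(frame, 44100) on a 40-50 ms voiced frame; evaluating the peak at the
# quefrency of f0 itself (rather than the global peak) gives the subharmonic-robust
# variant described in the METHODS above.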

RESULTS: The CPP differentiates between normative speakers and those with SMS, even when the effect of subharmonics is isolated.

CONCLUSIONS: The CPP is a robust index for determining the degree of dysphonia. It makes it possible to differentiate pathological voices from healthy voices even when subharmonics are present. Subharmonics are characteristic of the voices of SMS individuals and are not present in healthy voices. Both indices can be used simultaneously to differentiate SMS voices from neurotypical voices.}, } @article {pmid39418590, year = {2024}, author = {Krakauer, J and Naber, C and Niziolek, CA and Parrell, B}, title = {Divided Attention Has Limited Effects on Speech Sensorimotor Control.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {11}, pages = {4358-4368}, pmid = {39418590}, issn = {1558-9102}, support = {R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC019134/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Attention/physiology ; Male ; Female ; Young Adult ; *Speech/physiology ; Adult ; Feedback, Sensory/physiology ; Adaptation, Physiological/physiology ; Speech Perception/physiology ; Visual Perception/physiology ; Adolescent ; }, abstract = {PURPOSE: When vowel formants are externally perturbed, speakers change their production to oppose that perturbation both during the ongoing production (compensation) and in future productions (adaptation). To date, attempts to explain the large variability across individuals in these responses have focused on trait-based characteristics such as auditory acuity, but evidence from other motor domains suggests that attention may modulate the motor response to sensory perturbations. Here, we test the extent to which divided attention impacts sensorimotor control for supralaryngeal articulation.

METHOD: Neurobiologically healthy speakers were exposed to random (Experiment 1) or consistent (Experiment 2) real-time auditory perturbation of vowel formants to measure online compensation and trial-to-trial adaptation, respectively. In both experiments, participants completed two conditions: one with a simultaneous visual distractor task to divide attention and one without this secondary task.

RESULTS: Divided visual attention slightly reduced online compensation, but only starting > 300 ms after vowel onset, well beyond the typical duration of vowels in speech. Divided attention had no effect on adaptation.
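For illustration, the early-window compensation measure implied by these results can be sketched with the praat-parselmouth package as below; the file names, the vowel onset time, and the 0-300 ms window are hypothetical placeholders, not the study's actual pipeline.

import numpy as np
import parselmouth

def mean_f1(wav_path, onset_s, window_s=0.300, step=0.005):
    """Mean F1 (Hz) over an early window after vowel onset."""
    snd = parselmouth.Sound(wav_path)
    formants = snd.to_formant_burg(time_step=step)           # Burg formant tracking
    times = np.arange(onset_s, onset_s + window_s, step)
    return float(np.nanmean([formants.get_value_at_time(1, t) for t in times]))

# Online compensation can then be expressed as the F1 difference between a
# perturbed trial and a baseline trial, measured before 300 ms after onset:
# delta_f1 = mean_f1("perturbed_trial.wav", 0.10) - mean_f1("baseline_trial.wav", 0.12)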

CONCLUSIONS: The results from both experiments suggest that the use of sensory feedback in typical speech motor control is a largely automatic process unaffected by divided visual attention, indicating that the source of cross-speaker variability in response to formant perturbations likely lies within the speech production system rather than in higher-level cognitive processes. Methodologically, these results suggest that compensation for formant perturbations should be measured prior to 300 ms after vowel onset to avoid any potential impact of attention or other higher-order cognitive factors.}, } @article {pmid39414424, year = {2024}, author = {He, Y and Wang, X and Huang, T and Zhao, W and Fu, Z and Zheng, Q and Jin, L and Kim, H and Liu, H}, title = {The Study of Speech Acoustic Characteristics of Elderly Individuals with Presbyphagia in Ningbo, China.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.09.041}, pmid = {39414424}, issn = {1873-4588}, abstract = {The feasibility of using acoustic parameters to predict presbyphagia has been preliminarily confirmed. Considering that age and gender can influence the results of acoustic parameters, this study aimed to further explore the specific effects of age and gender on acoustic parameter analysis of the elderly population over 60 years old with presbyphagia. A total of 45 participants were enrolled and divided into three groups (60-69 years old, 70-79 years old, and 80-89 years old). Acoustic parameters, including maximum phonation time, first to third formant frequencies (F1-F3) of /a/, /i/, and /u/, oral diadochokinesis, the acoustic vowel space, and laryngeal diadochokinesis (LDDK), were extracted and calculated. Two-way analysis of variance was used to analyze the effects of age and gender on the acoustic parameters. The results indicate that the /hʌ/ LDDK rate differed significantly across age groups, with the 80-89 age group being significantly slower than the 60-69 age group. F1/a/, F2/a/, F2/i/, F3/i/, and F2i/F2u differed systematically between genders, with values being lower in males than in females. /hʌ/ LDDK regularity showed a consistent gender difference, with greater regularity in females. No significant differences were observed for other acoustic parameters. No significant interactions were revealed. According to the preliminary data, we hypothesized that respiratory capacity and control during vocal fold abduction weaken with aging. This highlights the importance of continuously monitoring the respiratory impact on swallowing function in elderly individuals. Additionally, gender influenced several acoustic parameters, indicating the necessity to differentiate between genders when assessing presbyphagia using acoustic parameters, especially focusing on swallowing function in elderly males in Ningbo.}, } @article {pmid39414423, year = {2024}, author = {Wang, Y and Zhao, Y}, title = {Acoustic Characteristics of Modern Chinese Folk Singing at Different Vocal Efforts.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.09.022}, pmid = {39414423}, issn = {1873-4588}, abstract = {OBJECTIVES: Modern Chinese folk singing was developed by fusing regionally specific traditional Chinese singing with Western scientific training techniques.
The purpose of this research is to contribute to the exploration of the acoustic characteristics of Chinese folk singing and of the efficient resonance space for its performance.

METHOD: Seven tenors and seven sopranos were invited to sing three songs and read the lyrics in an anechoic chamber. The vocal outputs were meticulously recorded and subjected to a comprehensive acoustic analysis. Overall equivalent sound level, long-term average spectrum (LTAS), gain factors, and other acoustic parameters were analyzed for different vocal efforts (soft, normal, and loud), genders, and vocal modes (singing and speaking).

RESULTS: Male singers show a singer's formant at 3 kHz in the LTAS, a characteristic not found in other countries' singers or in Chinese opera singers, and slightly higher in frequency than that of Western Classical singers. Female singers do not show a singer's formant, and their LTAS curves are much flatter. The α ratio, spectral balance, and singing power ratio all increased with increasing vocal effort and were higher for singing than for speaking. Finally, there was a significant gain factor at 3 kHz, with a maximum value of 1.85 for men and 1.68 for women.
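Two of the LTAS-derived measures named here can be approximated as in the sketch below (one common definition of the α ratio and of the singing power ratio); the band edges and analysis settings are illustrative assumptions, not the study's exact parameters.

import numpy as np
from scipy.signal import welch

def ltas_measures(y, sr):
    """Alpha ratio and singing power ratio (both in dB) from a long-term spectrum."""
    freqs, psd = welch(y, fs=sr, nperseg=4096)               # long-term average spectrum
    low = (freqs >= 50) & (freqs < 1000)
    high = (freqs >= 1000) & (freqs < 5000)
    alpha = 10 * np.log10(psd[high].sum() / psd[low].sum())  # energy above vs. below 1 kHz
    psd_db = 10 * np.log10(psd + 1e-20)
    spr = psd_db[(freqs >= 0) & (freqs < 2000)].max() \
        - psd_db[(freqs >= 2000) & (freqs < 4000)].max()     # low-band peak minus 2-4 kHz peak
    return alpha, spr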

CONCLUSIONS: Male singers in Chinese folk singing have a singer's formant, a phenomenon not consistently observed in their female counterparts. The intricate acoustic characteristics of this singing style have been extensively examined and can contribute to the existing literature on the spectral properties of diverse vocal genres. Furthermore, this analysis offers foundational data essential for the optimization of room acoustics tailored to vocal performance.}, } @article {pmid39400271, year = {2024}, author = {Clopper, CG}, title = {Dynamic acoustic vowel distances within and across dialects.}, journal = {The Journal of the Acoustical Society of America}, volume = {156}, number = {4}, pages = {2497-2507}, doi = {10.1121/10.0032385}, pmid = {39400271}, issn = {1520-8524}, mesh = {Humans ; *Speech Acoustics ; *Phonetics ; *Speech Production Measurement/methods ; Voice Quality ; Acoustics ; Female ; Male ; Time Factors ; Language ; Sound Spectrography ; Adult ; }, abstract = {Vowels vary in their acoustic similarity across regional dialects of American English, such that some vowels are more similar to one another in some dialects than others. Acoustic vowel distance measures typically evaluate vowel similarity at a discrete time point, resulting in distance estimates that may not fully capture vowel similarity in formant trajectory dynamics. In the current study, language and accent distance measures, which evaluate acoustic distances between talkers over time, were applied to the evaluation of vowel category similarity within talkers. These vowel category distances were then compared across dialects, and their utility in capturing predicted patterns of regional dialect variation in American English was examined. Dynamic time warping of mel-frequency cepstral coefficients was used to assess acoustic distance across the frequency spectrum and captured predicted Southern American English vowel similarity. Root-mean-square distance and generalized additive mixed models were used to assess acoustic distance for selected formant trajectories and captured predicted Southern, New England, and Northern American English vowel similarity. Generalized additive mixed models captured the most predicted variation, but, unlike the other measures, do not return a single acoustic distance value. All three measures are potentially useful for understanding variation in vowel category similarity across dialects.}, } @article {pmid39396508, year = {2024}, author = {Ozkan Atak, HB and Aslan, F and Sennaroglu, G and Sennaroglu, L}, title = {Children with Auditory Brainstem Implants: Language Proficiency and Reading Comprehension Process.}, journal = {Audiology & neuro-otology}, volume = {}, number = {}, pages = {1-12}, doi = {10.1159/000541716}, pmid = {39396508}, issn = {1421-9700}, abstract = {INTRODUCTION: Auditory performance and language proficiency in young children who utilize auditory brainstem implants (ABIs) throughout the first 3 years of life are difficult to predict. ABI users have challenges as a result of delays in language proficiency and the acquisition of reading comprehension, even though ABI technology offers auditory experiences that enhance spoken language development. The aim of this study was to evaluate the impact of language proficiency on reading comprehension skills in children with ABI.

METHOD: In this study, 20 children with ABI were evaluated for their reading comprehension abilities and language proficiency using an Informal Reading Inventory, Test of Early Language Development-Third Edition (TELD-3), Categories of Auditory Performance-II (CAP-II), and Speech Intelligibility Rating (SIR). Three distinct aspects of reading comprehension were assessed and analyzed to provide a composite score for reading comprehension abilities. TELD-3, which measures receptive and expressive language proficiency, was presented through spoken language.

RESULTS: Studies have shown that there is a relationship between language proficiency and reading comprehension in children with ABI. In the present study, it was determined that the total reading comprehension scores of the children who had poor language proficiency and were enrolled in the school for the deaf were also low. These children used short, basic sentences, often repeated words and phrases, and had a restricted vocabulary. In addition, they had difficulty reading characters and detailed paragraphs and could not remember events in a logical order.

CONCLUSION: Children with ABI may have compromised reading comprehension abilities due to a lack of access to all the speech formants needed to develop spoken language. In addition, variables affecting the reading levels of children with ABI include factors such as age at implantation, duration of implant use, presence of additional disability, communication model, and access to auditory rehabilitation. The reading comprehension skills of ABI users were evaluated in this study for the first time in the literature and may constitute a starting point for the examination of variables affecting reading comprehension in this area.}, } @article {pmid39392353, year = {2024}, author = {Yegnanarayana, B and Pannala, V}, title = {Processing group delay spectrograms for study of formant and harmonic contours in speech signals.}, journal = {The Journal of the Acoustical Society of America}, volume = {156}, number = {4}, pages = {2422-2433}, doi = {10.1121/10.0032364}, pmid = {39392353}, issn = {1520-8524}, mesh = {Humans ; *Speech Acoustics ; Sound Spectrography ; Signal Processing, Computer-Assisted ; Speech Production Measurement/methods ; Voice Quality ; Time Factors ; Phonetics ; }, abstract = {This paper deals with the study of formant and harmonic contours by processing the group delay (GD) spectrograms of speech signals. The GD spectrum is the negative derivative of the phase spectrum with respect to frequency. A recent study shows that the GD spectrogram can be obtained without phase wrapping. Formant frequency contours can be observed in the display of the peaks of the instantaneous wideband equivalent GD spectrogram, derived using the modified single frequency filtering (SFF) analysis of speech signals. Harmonic frequency contours can be observed in the display of the peaks of the instantaneous narrowband equivalent GD spectrogram, derived using the modified SFF analysis of speech signals. For synthetic speech signals, the observed formant contours match the ground truth formant contours from which the signal is derived. For natural speech signals, the observed formant contours match approximately with the given ground truth formant contours, mostly in the voiced regions. The results are illustrated for several randomly selected utterances from the TIMIT database. While this study helps to observe the contours of formants in the display, automatic extraction of the formant frequencies needs further processing, requiring logic for eliminating the spurious points without forcing the number of formants.}, } @article {pmid39356074, year = {2024}, author = {Parrell, B and Niziolek, CA and Chen, T}, title = {Sensorimotor adaptation to a nonuniform formant perturbation generalizes to untrained vowels.}, journal = {Journal of neurophysiology}, volume = {132}, number = {5}, pages = {1437-1444}, pmid = {39356074}, issn = {1522-1598}, support = {P50 HD105353/HD/NICHD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC019134/DC/NIDCD NIH HHS/United States ; BCS 2120506//National Science Foundation (NSF)/ ; }, mesh = {Humans ; Male ; Female ; Adult ; *Adaptation, Physiological/physiology ; Young Adult ; *Speech/physiology ; Learning/physiology ; Speech Perception/physiology ; Generalization, Psychological/physiology ; Phonetics ; Feedback, Sensory/physiology ; }, abstract = {When speakers learn to change the way they produce a speech sound, how much does that learning generalize to other speech sounds?
Past studies of speech sensorimotor learning have typically tested the generalization of a single transformation learned in a single context. Here, we investigate the ability of the speech motor system to generalize learning when multiple opposing sensorimotor transformations are learned in separate regions of the vowel space. We find that speakers adapt to a nonuniform "centralization" perturbation, learning to produce vowels with greater acoustic contrast, and that this adaptation generalizes to untrained vowels, which pattern like neighboring trained vowels and show increased contrast of a similar magnitude.NEW & NOTEWORTHY We show that sensorimotor adaptation of vowels at the edges of the articulatory working space generalizes to intermediate vowels through local transfer of learning from adjacent vowels. These results extend findings on the locality of sensorimotor learning from upper limb control to speech, a complex task with an opaque and nonlinear transformation between motor actions and sensory consequences. Our results also suggest that our paradigm has potential to drive behaviorally relevant changes that improve communication effectiveness.}, } @article {pmid39322510, year = {2024}, author = {Huang, T and Wang, X and Xu, T and Zhao, W and Cao, Y and Kim, H and Yi, B}, title = {Acoustic Analysis of Mandarin-Speaking Transgender Women.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.08.037}, pmid = {39322510}, issn = {1873-4588}, abstract = {OBJECTIVES: This study aims to investigate the speech characteristics and assess the potential risk of voice fatigue and voice disorders in Chinese transgender women (TW).

METHODS: A case-control study was conducted involving TW recruited in Shanghai, China. The participants included 15 TW, 20 cisgender men (CISM), and 20 cisgender women (CISW). Acoustic parameters, including formants (F1, F2, F3, F4), cepstral peak prominence (CPP), jitter, shimmer, harmonic-to-noise ratio (HNR), noise-to-harmonics ratio (NHR), fundamental frequency (f0), and intensity, were measured across vowels, passages, and free talking. Additionally, the Voice Handicap Index-10 (VHI-10) and the Voice Fatigue Index were administered to evaluate voice-related concerns.
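Perturbation measures of this kind are conventionally obtained by scripting Praat; a minimal praat-parselmouth sketch follows. The parameter values are common Praat defaults and the file path is a hypothetical placeholder, not necessarily the settings used in this study.

import parselmouth
from parselmouth.praat import call

snd = parselmouth.Sound("speaker_vowel_a.wav")   # hypothetical recording
pitch_floor, pitch_ceiling = 75, 500
point_process = call(snd, "To PointProcess (periodic, cc)", pitch_floor, pitch_ceiling)
jitter = call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
shimmer = call([snd, point_process], "Get shimmer (local)",
               0, 0, 0.0001, 0.02, 1.3, 1.6)     # needs Sound + PointProcess selected
harmonicity = call(snd, "To Harmonicity (cc)", 0.01, pitch_floor, 0.1, 1.0)
hnr = call(harmonicity, "Get mean", 0, 0)
print(f"jitter={jitter:.4f}  shimmer={shimmer:.4f}  HNR={hnr:.1f} dB")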

RESULTS: (1) The F1 of TW was significantly higher than that of CISW for the vowels /i/ and /u/, and significantly higher than that of CISM for the vowels /a/, /i/, and /u/. The F2 of TW was significantly lower than that of CISW for the vowel /i/, significantly higher than that of CISW for the vowel /u/, and significantly higher than that of CISM for the vowels /a/ and /u/. F3 was significantly lower in TW than in CISW for the vowels /a/ and /i/. The F4 formant was significantly lower in TW than in CISW for the vowels /a/ and /i/, but significantly higher than in CISM for the vowel /u/. (2) The f0 of TW was significantly lower than that of CISW for the vowels /a/, /i/, and /u/, during passage reading, and in free speech, but was significantly higher than that of CISM during passage reading and free talking. Additionally, TW exhibited significantly higher intensity compared with CISW for the vowel /a/ and during passage reading. (3) Jitter in TW was significantly higher than in CISW for the vowels /i/ and /u/, and significantly lower than in CISM during passage reading and free talking. Shimmer was significantly higher in TW compared with both CISW and CISM for the vowels /a/ and /i/, during passage reading, and in free talking. The HNR in TW was significantly lower than in both CISW and CISM across all vowels, during passage reading, and in free talking. The NHR was significantly higher in TW than in CISW across all vowels, during passage reading, and in free talking, and significantly higher than in CISM for the vowels /a/ and /i/, during passage reading, and in free talking. The CPP in TW was significantly lower than in CISW during passage reading and free talking, and significantly lower than in CISM across all vowels, during passage reading, and in free speech. (4) The VHI-10 scores were significantly higher in TW compared with both CISM and CISW.

CONCLUSIONS: Without undergoing phonosurgery or voice training, TW exhibit certain acoustic parameters, such as f0 and some of the formants, that fall between those of CISW and CISM. The findings suggest a potential risk for voice fatigue and the development of voice disorders as TW try to modify their vocal characteristics to align with their gender identity.}, } @article {pmid39287502, year = {2024}, author = {Kim, H and Ratkute, V and Epp, B}, title = {Monaural and binaural masking release with speech-like stimuli.}, journal = {JASA express letters}, volume = {4}, number = {9}, pages = {}, doi = {10.1121/10.0028736}, pmid = {39287502}, issn = {2691-1191}, mesh = {Humans ; *Perceptual Masking/physiology ; *Speech Perception/physiology ; Adult ; Acoustic Stimulation ; Male ; Female ; Young Adult ; }, abstract = {The relevance of comodulation and interaural phase difference for speech perception is still unclear. We used speech-like stimuli to link spectro-temporal properties of formants with masking release. The stimuli comprised a tone and three masker bands centered at formant frequencies F1, F2, and F3 derived from a consonant-vowel. The target was a diotic or dichotic frequency-modulated tone following F2 trajectories. Results showed a small comodulation masking release, while the binaural masking level difference was comparable to previous findings. The data suggest that factors other than comodulation may play a dominant role in grouping frequency components in speech.}, } @article {pmid39279469, year = {2024}, author = {Chen, S and Whalen, DH and Mok, PPK}, title = {What R Mandarin Chinese /ɹ/s? - acoustic and articulatory features of Mandarin Chinese rhotics.}, journal = {Phonetica}, volume = {81}, number = {5}, pages = {509-552}, pmid = {39279469}, issn = {1423-0321}, support = {R01 DC002717/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Phonetics ; *Speech Acoustics ; *Tongue/physiology ; Female ; Male ; China ; *Language ; Adult ; Young Adult ; Speech Production Measurement ; Ultrasonography ; East Asian People ; }, abstract = {Rhotic sounds are well known for their considerable phonetic variation within and across languages and their complexity in speech production. Although rhotics in many languages have been examined and documented, the phonetic features of Mandarin rhotics remain unclear, and debates about the prevocalic rhotic (the syllable-onset rhotic) persist. This paper extends the investigation of rhotic sounds by examining the articulatory and acoustic features of Mandarin Chinese rhotics in prevocalic, syllabic (the rhotacized vowel [ɚ]), and postvocalic (r-suffix) positions. Eighteen speakers from Northern China were recorded using ultrasound imaging. Results showed that Mandarin syllabic and postvocalic rhotics can be articulated with various tongue shapes, including tongue-tip-up retroflex and tongue-tip-down bunched shapes. Different tongue shapes have no significant acoustic differences in the first three formants, demonstrating a many-to-one articulation-acoustics relationship. The prevocalic rhotics in our data were found to be articulated only with bunched tongue shapes, and were sometimes produced with frication noise at the start. In general, rhotics in all syllable positions are characterized by a close F2 and F3, though the prevocalic rhotic has a higher F2 and F3 than the syllabic and postvocalic rhotics.
The effects of syllable position and vowel context are also discussed.}, } @article {pmid39259883, year = {2024}, author = {Thompson, A and Kim, Y}, title = {Acoustic and Kinematic Predictors of Intelligibility and Articulatory Precision in Parkinson's Disease.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {10}, pages = {3595-3611}, pmid = {39259883}, issn = {1558-9102}, support = {F31 DC020121/DC/NIDCD NIH HHS/United States ; R03 DC012405/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Parkinson Disease/physiopathology/complications ; *Speech Intelligibility/physiology ; Female ; Male ; Biomechanical Phenomena ; Aged ; *Dysarthria/etiology/physiopathology ; *Speech Acoustics ; Middle Aged ; Speech Production Measurement/methods ; Case-Control Studies ; Phonetics ; }, abstract = {PURPOSE: This study investigated relationships within and between perceptual, acoustic, and kinematic measures in speakers with and without dysarthria due to Parkinson's disease (PD) across different clarity conditions. Additionally, the study assessed the predictive capabilities of selected acoustic and kinematic measures for intelligibility and articulatory precision ratings.

METHOD: Forty participants, comprising 22 with PD and 18 controls, read three phrases aloud using conversational, less clear, and more clear speaking conditions. Acoustic measures and their theoretical kinematic parallel measures (i.e., acoustic and kinematic distance and vowel space area [VSA]; second formant frequency [F2] slope and kinematic speed) were obtained from the diphthong /aɪ/ and selected vowels in the sentences. A total of 368 listeners from crowdsourcing provided ratings for intelligibility and articulatory precision. The research questions were examined using correlations and linear mixed-effects models.
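As context for the vowel space area (VSA) measure used here: acoustic VSA is typically computed as the area of the polygon spanned by corner-vowel means in the F1-F2 plane. The sketch below uses the shoelace formula; the corner-vowel values are invented for illustration.

import numpy as np

def vsa(points_hz):
    """Area (Hz^2) of the polygon through (F1, F2) corner-vowel means, in order."""
    f1, f2 = np.array(points_hz).T
    return 0.5 * abs(np.dot(f1, np.roll(f2, -1)) - np.dot(f2, np.roll(f1, -1)))

corners = [(750, 1300), (650, 1900), (300, 2200), (350, 900)]  # /a ae i u/, invented values
print(f"VSA = {vsa(corners):.0f} Hz^2")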

RESULTS: Intelligibility and articulatory precision ratings were highly correlated across all speakers. Acoustic and kinematic distance, as well as F2 slope and kinematic speed, showed moderately positive correlations. In contrast, acoustic and kinematic VSA exhibited no correlation. Among all measures, acoustic VSA and kinematic distance were robust predictors of both intelligibility and articulatory precision ratings, but they were stronger predictors of articulatory precision.

CONCLUSIONS: The findings highlight the importance of measurement selection when examining cross-domain relationships. Additionally, they support the use of behavioral modifications aimed at eliciting larger articulatory gestures to improve intelligibility in individuals with dysarthria due to PD.

OPEN SCIENCE FORM: https://doi.org/10.23641/asha.27011281.}, } @article {pmid39234407, year = {2024}, author = {Subrahmanya, A and Ranasinghe, KG and Kothare, H and Raharjo, I and Kim, KS and Houde, JF and Nagarajan, SS}, title = {Pitch corrections occur in natural speech and are abnormal in patients with Alzheimer's disease.}, journal = {Frontiers in human neuroscience}, volume = {18}, number = {}, pages = {1424920}, pmid = {39234407}, issn = {1662-5161}, abstract = {Past studies have explored formant centering, a corrective behavior of convergence over the duration of an utterance toward the formants of a putative target vowel. In this study, we establish the existence of a similar centering phenomenon for pitch in healthy elderly controls and examine how such corrective behavior is altered in Alzheimer's Disease (AD). We found that the pitch centering response in the healthy elderly was similar when correcting pitch errors below and above the target (median) pitch. In contrast, patients with AD showed an asymmetry, with a larger correction for pitch errors below the target phonation than above the target phonation. These findings indicate that pitch centering is a robust compensation behavior in human speech. Our findings also shed light on how the neurodegenerative processes that affect speech in AD may impact pitch centering.}, } @article {pmid39218756, year = {2024}, author = {Vampola, T and Horáček, J and Laukkanen, AM}, title = {Three-Dimensional Finite Element Modeling of the Singer's Formant Cluster Optimization by Epilaryngeal Narrowing With and Without Velopharyngeal Opening.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.07.035}, pmid = {39218756}, issn = {1873-4588}, abstract = {This study aimed to find the optimal geometrical configuration of the vocal tract (VT) to increase the total acoustic energy output of human voice in the frequency interval 2-3.5 kHz (the "singer's formant cluster," SFC) for the vowels [a:] and [i:], considering epilaryngeal changes and the velopharyngeal opening (VPO). The study applied 3D volume models of the vocal and nasal tract based on computed tomography images of a female speaker. The epilaryngeal narrowing (EN) increased the total sound pressure level (SPL) and SPL of the SFC by diminishing the frequency difference between acoustic resonances F3 and F4 for [a:] and between F2 and F3 for [i:]. The effect reached its maximum at the low pharynx/epilarynx cross-sectional area ratio 11.4:1 for [a:] and 25:1 for [i:]. The acoustic results obtained with the model optimization are in good agreement with the results of an internationally recognized operatic alto singer. With the EN and the VPO, the VT input reactance was positive over the entire fo singing range (ca 75-1500 Hz). The VPO increased the strength of the SFC and diminished the SPL of F1 for both vowels, but with EN, the SPL decrease was compensated. The effect of EN is not linear and depends on the vowel.
Both the EN and the VPO, alone and together, can support (singing) voice production.}, } @article {pmid39217086, year = {2024}, author = {Figueroa, C and Guillén, V and Huenupán, F and Vallejos, C and Henríquez, E and Urrutia, F and Sanhueza, F and Alarcón, E}, title = {Comparison of Acoustic Parameters of Voice and Speech According to Vowel Type and Suicidal Risk in Adolescents.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.08.006}, pmid = {39217086}, issn = {1873-4588}, abstract = {UNLABELLED: Globally, suicide prevention and understanding suicidal behavior represent significant health challenges. The predictive potential of voice, speech, and language appears to be a promising solution to the difficulty of assessment.

OBJECTIVE: To analyze variations in acoustic parameters in voice and speech based on vowel types according to different levels of suicidal risk among adolescents in a text reading task.

METHODOLOGY: Cross-sectional analytical design using nonprobabilistic sampling. Our sample comprised 98 adolescents aged 14 to 19, who underwent voice acoustic assessment along with determination of suicidal ideation through the Okasha Suicidality Scale and the Beck Depression Inventory. Acoustic analysis of the recordings was conducted using Praat for phonetic research and a Python program, with a Focusrite interface and microphone used to register voice and speech acoustic parameters such as fundamental frequency, jitter, and formants. Subsequently, data from adolescents with and without suicidal risk were compared.
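A pipeline of the kind described (Praat-based extraction plus a Python group comparison) might look like the following sketch; the file lists are hypothetical placeholders, and the nonparametric test is one reasonable choice, not necessarily the authors' statistical procedure.

import numpy as np
import parselmouth
from scipy.stats import mannwhitneyu

def mean_f0(wav_path):
    """Mean F0 (Hz) over voiced frames of a recording."""
    pitch = parselmouth.Sound(wav_path).to_pitch(pitch_floor=75, pitch_ceiling=500)
    f0 = pitch.selected_array['frequency']
    return float(np.nanmean(np.where(f0 > 0, f0, np.nan)))  # 0 Hz marks unvoiced frames

risk = [mean_f0(p) for p in ["risk_01.wav", "risk_02.wav"]]      # hypothetical files
no_risk = [mean_f0(p) for p in ["ctrl_01.wav", "ctrl_02.wav"]]   # hypothetical files
stat, p = mannwhitneyu(risk, no_risk)
print(f"U={stat:.1f}, p={p:.3f}")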

RESULTS: Significant differences were observed between suicidal and nonsuicidal adolescents in several acoustic aspects, especially in females, in fundamental frequency (F0), harmonics-to-noise ratio (HNRdB), and temporal variability measured by jitter and standard deviation. In men, differences were found in F0 and HNRdB (P < 0.05).

CONCLUSION: This study demonstrated statistically significant variations in various voice acoustic parameters among adolescents with and without suicidal risk. These findings underscore the potential relevance of voice and speech as markers for suicidal risk.}, } @article {pmid39212078, year = {2024}, author = {Zaltz, Y}, title = {The Impact of Trained Conditions on the Generalization of Learning Gains Following Voice Discrimination Training.}, journal = {Trends in hearing}, volume = {28}, number = {}, pages = {23312165241275895}, pmid = {39212078}, issn = {2331-2165}, mesh = {Humans ; Male ; Female ; Young Adult ; *Speech Perception/physiology ; *Generalization, Psychological ; *Cues ; *Noise/adverse effects ; *Acoustic Stimulation ; Adult ; Recognition, Psychology ; Perceptual Masking ; Adolescent ; Speech Acoustics ; Voice Quality ; Discrimination Learning/physiology ; Voice/physiology ; }, abstract = {Auditory training can lead to notable enhancements in specific tasks, but whether these improvements generalize to untrained tasks like speech-in-noise (SIN) recognition remains uncertain. This study examined how training conditions affect generalization. Fifty-five young adults were divided into "Trained-in-Quiet" (n = 15), "Trained-in-Noise" (n = 20), and "Control" (n = 20) groups. Participants completed two sessions. The first session involved an assessment of SIN recognition and voice discrimination (VD) with word or sentence stimuli, employing combined fundamental frequency (F0) + formant frequencies voice cues. Subsequently, only the trained groups proceeded to an interleaved training phase, encompassing six VD blocks with sentence stimuli, utilizing either F0-only or formant-only cues. The second session replicated the interleaved training for the trained groups, followed by a second assessment conducted by all three groups, identical to the first session. Results showed significant improvements in the trained task regardless of training conditions. However, VD training with a single cue did not enhance VD with both cues beyond control group improvements, suggesting limited generalization. Notably, the Trained-in-Noise group exhibited the most significant SIN recognition improvements posttraining, implying generalization across tasks that share similar acoustic conditions. Overall, findings suggest training conditions impact generalization by influencing processing levels associated with the trained task. Training in noisy conditions may prompt higher auditory and/or cognitive processing than training in quiet, potentially extending skills to tasks involving challenging listening conditions, such as SIN recognition. These insights hold significant theoretical and clinical implications, potentially advancing the development of effective auditory training protocols.}, } @article {pmid39185222, year = {2024}, author = {Parrell, B and Naber, C and Kim, OA and Nizolek, CA and McDougle, SD}, title = {Audiomotor prediction errors drive speech adaptation even in the absence of overt movement.}, journal = {bioRxiv : the preprint server for biology}, volume = {}, number = {}, pages = {}, pmid = {39185222}, issn = {2692-8205}, support = {R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC019134/DC/NIDCD NIH HHS/United States ; R01 NS132926/NS/NINDS NIH HHS/United States ; }, abstract = {Observed outcomes of our movements sometimes differ from our expectations. 
These sensory prediction errors recalibrate the brain's internal models for motor control, reflected in alterations to subsequent movements that counteract these errors (motor adaptation). While leading theories suggest that all forms of motor adaptation are driven by learning from sensory prediction errors, dominant models of speech adaptation argue that adaptation results from integrating time-advanced copies of corrective feedback commands into feedforward motor programs. Here, we tested these competing theories of speech adaptation by inducing planned, but not executed, speech. Human speakers (male and female) were prompted to speak a word and, on a subset of trials, were rapidly cued to withhold the prompted speech. On standard trials, speakers were exposed to real-time playback of their own speech with an auditory perturbation of the first formant to induce single-trial speech adaptation. Speakers experienced a similar sensory error on movement cancelation trials, hearing a perturbation applied to a recording of their speech from a previous trial at the time they would have spoken. Speakers adapted to auditory prediction errors in both contexts, altering the spectral content of spoken vowels to counteract formant perturbations even when no actual movement coincided with the perturbed feedback. These results build upon recent findings in reaching, and suggest that prediction errors, rather than corrective motor commands, drive adaptation in speech.}, } @article {pmid39182457, year = {2024}, author = {Chan, RKW and Wang, BX}, title = {Do long-term acoustic-phonetic features and mel-frequency cepstral coefficients provide complementary speaker-specific information for forensic voice comparison?.}, journal = {Forensic science international}, volume = {363}, number = {}, pages = {112199}, doi = {10.1016/j.forsciint.2024.112199}, pmid = {39182457}, issn = {1872-6283}, mesh = {Humans ; Male ; *Phonetics ; *Speech Acoustics ; Sound Spectrography ; *Voice Quality ; Adult ; Forensic Sciences/methods ; Middle Aged ; Young Adult ; Signal Processing, Computer-Assisted ; }, abstract = {A growing number of studies in forensic voice comparison have explored how elements of phonetic analysis and automatic speaker recognition systems may be integrated for optimal speaker discrimination performance. However, few studies have investigated the evidential value of long-term speech features using forensically-relevant speech data. This paper reports an empirical validation study that assesses the evidential strength of the following long-term features: fundamental frequency (F0), formant distributions, laryngeal voice quality, mel-frequency cepstral coefficients (MFCCs), and combinations thereof. Non-contemporaneous recordings with speech style mismatch from 75 male Australian English speakers were analyzed. Results show that 1) MFCCs outperform long-term acoustic phonetic features; 2) source and filter features do not provide considerably complementary speaker-specific information; and 3) the addition of long-term phonetic features to an MFCCs-based system does not lead to meaningful improvement in system performance. 
Implications for the complementarity of phonetic analysis and automatic speaker recognition systems are discussed.}, } @article {pmid39175901, year = {2024}, author = {Huang, L and Yang, H and Che, Y and Yang, J}, title = {Automatic speech analysis for detecting cognitive decline of older adults.}, journal = {Frontiers in public health}, volume = {12}, number = {}, pages = {1417966}, pmid = {39175901}, issn = {2296-2565}, mesh = {Humans ; Aged ; Female ; Male ; *Cognitive Dysfunction/diagnosis ; China ; Alzheimer Disease/diagnosis ; Aged, 80 and over ; Speech ; Middle Aged ; Bayes Theorem ; Support Vector Machine ; Algorithms ; }, abstract = {BACKGROUND: Speech analysis has been expected to serve as a screening tool for early detection of Alzheimer's disease (AD) and mild cognitive impairment (MCI). Acoustic features and linguistic features are usually used in speech analysis. However, no studies have yet determined which type of features provides better screening effectiveness, especially in the large aging population of China.
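For reference, the long-term MFCC features compared in the forensic study above (and included among the acoustic features in the study below) are commonly summarized as recording-level statistics of frame-wise coefficients, as in this sketch; the file path is a hypothetical placeholder, and librosa is assumed to be available.

import numpy as np
import librosa

y, sr = librosa.load("speaker_recording.wav", sr=16000)  # hypothetical file
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)       # shape (13, n_frames)
long_term = np.concatenate([mfcc.mean(axis=1), mfcc.std(axis=1)])
print(long_term.shape)   # a 26-dimensional long-term representation of the recording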

OBJECTIVE: Firstly, to compare the screening effectiveness of acoustic features, linguistic features, and their combination using the same dataset. Secondly, to develop a Chinese automated diagnosis model using self-collected natural discourse data obtained from native Chinese speakers.

METHODS: A total of 92 participants from communities in Shanghai completed the MoCA-B and a picture description task based on the Cookie Theft picture under the guidance of trained operators, and were divided into three groups, AD, MCI, and healthy control (HC), based on their MoCA-B scores. Acoustic features (pitch, jitter, shimmer, MFCCs, formants) and linguistic features (part-of-speech, type-token ratio, information words, information units) were extracted. The machine learning algorithms used in this study included logistic regression, random forest (RF), support vector machines (SVM), Gaussian Naive Bayes (GNB), and k-nearest neighbors (kNN). The validation accuracies of the same ML model using acoustic features, linguistic features, and their combination were compared.
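A scikit-learn sketch of the validation-accuracy comparison described in this paragraph is given below; X_acoustic, X_linguistic, and y are random stand-ins for the study's extracted features and MoCA-B-derived labels, not real data.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

rng = np.random.default_rng(0)
X_acoustic = rng.normal(size=(92, 20))     # placeholder acoustic features
X_linguistic = rng.normal(size=(92, 10))   # placeholder linguistic features
y = rng.integers(0, 2, size=92)            # placeholder HC vs. AD labels

models = {"LogReg": LogisticRegression(max_iter=1000), "RF": RandomForestClassifier(),
          "SVM": SVC(), "GNB": GaussianNB(), "kNN": KNeighborsClassifier()}
for feats, X in [("acoustic", X_acoustic), ("linguistic", X_linguistic),
                 ("combined", np.hstack([X_acoustic, X_linguistic]))]:
    for name, model in models.items():
        acc = cross_val_score(model, X, y, cv=5).mean()   # 5-fold validation accuracy
        print(f"{feats:10s} {name:6s} accuracy={acc:.3f}")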

RESULTS: The accuracy with linguistic features was generally higher than with acoustic features in training. The highest accuracy in differentiating HC and AD was 80.77%, achieved by SVM based on all the features extracted from the speech data, while the highest accuracy in differentiating HC from AD or MCI was 80.43%, achieved by RF based only on linguistic features.

CONCLUSION: Our results suggest the utility and validity of linguistic features in the automated diagnosis of cognitive impairment, and validate the applicability of automated diagnosis for Chinese language data.}, } @article {pmid39171236, year = {2024}, author = {Holmes, L and Rieger, G and Paulmann, S}, title = {The effect of sexual orientation on voice acoustic properties.}, journal = {Frontiers in psychology}, volume = {15}, number = {}, pages = {1412372}, pmid = {39171236}, issn = {1664-1078}, abstract = {INTRODUCTION: Previous research has investigated sexual orientation differences in the acoustic properties of individuals' voices, often theorizing that homosexuals of both sexes would have voice properties mirroring those of heterosexuals of the opposite sex. Findings were mixed, but many of these studies have methodological limitations including small sample sizes, use of recited passages instead of natural speech, or grouping bisexual and homosexual participants together for analyses.

METHODS: To address these shortcomings, the present study examined a wide range of acoustic properties in the natural voices of 142 men and 175 women of varying sexual orientations, with sexual orientation treated as a continuous variable throughout.

RESULTS: Homosexual men had less breathy voices (as indicated by a lower harmonics-to-noise ratio) and, contrary to our prediction, a lower voice pitch and narrower pitch range than heterosexual men. Homosexual women had lower F4 formant frequency (vocal tract resonance or so-called overtone) in overall vowel production, and rougher voices (measured via jitter and spectral tilt) than heterosexual women. For those sexual orientation differences that were statistically significant, bisexuals were in-between heterosexuals and homosexuals. No sexual orientation differences were found in formants F1-F3, cepstral peak prominence, shimmer, or speech rate in either sex.

DISCUSSION: Recommendations for future "natural voice" investigations are outlined.}, } @article {pmid39091036, year = {2024}, author = {Goncharova, M and Jadoul, Y and Reichmuth, C and Fitch, WT and Ravignani, A}, title = {Vocal tract dynamics shape the formant structure of conditioned vocalizations in a harbor seal.}, journal = {Annals of the New York Academy of Sciences}, volume = {1538}, number = {1}, pages = {107-116}, doi = {10.1111/nyas.15189}, pmid = {39091036}, issn = {1749-6632}, support = {(#W1262-B29)//Austrian Science Foundation Grant/ ; DNRF117//Danmarks Grundforskningsfond/ ; N00014-04-1-0284//Office of Naval Research/ ; Independent Max Planck Research Group Leader funding//Max-Planck-Gesellschaft/ ; Advanced Grant SOMACCA/ERC_/European Research Council/International ; }, mesh = {Animals ; *Vocalization, Animal/physiology ; Male ; Tongue/physiology ; Jaw/physiology/anatomy & histology ; Phocoena/physiology ; Humans ; }, abstract = {Formants, or resonance frequencies of the upper vocal tract, are an essential part of acoustic communication. Articulatory gestures-such as jaw, tongue, lip, and soft palate movements-shape formant structure in human vocalizations, but little is known about how nonhuman mammals use those gestures to modify formant frequencies. Here, we report a case study with an adult male harbor seal trained to produce an arbitrary vocalization composed of multiple repetitions of the sound wa. We analyzed jaw movements frame-by-frame and matched them to the tracked formant modulation in the corresponding vocalizations. We found that the jaw opening angle was strongly correlated with the first (F1) and, to a lesser degree, with the second formant (F2). F2 variation was better explained by the jaw angle opening when the seal was lying on his back rather than on the belly, which might derive from soft tissue displacement due to gravity. These results show that harbor seals share some common articulatory traits with humans, where the F1 depends more on the jaw position than F2. We propose further in vivo investigations of seals to further test the role of the tongue on formant modulation in mammalian sound production.}, } @article {pmid39086377, year = {2024}, author = {Dorman, MF and Natale, SC and Stohl, JS and Felder, J}, title = {Close approximations to the sound of a cochlear implant.}, journal = {Frontiers in human neuroscience}, volume = {18}, number = {}, pages = {1434786}, pmid = {39086377}, issn = {1662-5161}, abstract = {Cochlear implant (CI) systems differ in terms of electrode design and signal processing. It is likely that patients fit with different implant systems will experience different percepts when presented speech via their implant. The sound quality of speech can be evaluated by asking single-sided-deaf (SSD) listeners fit with a cochlear implant (CI) to modify clean signals presented to their typically hearing ear to match the sound quality of signals presented to their CI ear. In this paper, we describe very close matches to CI sound quality, i.e., similarity ratings of 9.5 to 10 on a 10-point scale, by ten patients fit with a 28 mm electrode array and MED EL signal processing. The modifications required to make close approximations to CI sound quality fell into two groups: One consisted of a restricted frequency bandwidth and spectral smearing while a second was characterized by a wide bandwidth and no spectral smearing. 
Both sets of modifications were different from those found for patients with shorter electrode arrays, who chose upshifts in voice pitch and formant frequencies to match CI sound quality. The data from matching-based metrics of CI sound quality document that speech sound quality differs for patients fit with different CIs and among patients fit with the same CI.}, } @article {pmid39056002, year = {2024}, author = {Bonacina, S and Krizman, J and Farley, J and Nicol, T and LaBella, CR and Kraus, N}, title = {Persistent post-concussion symptoms include neural auditory processing in young children.}, journal = {Concussion (London, England)}, volume = {9}, number = {1}, pages = {CNC114}, pmid = {39056002}, issn = {2056-3299}, abstract = {AIM: Difficulty understanding speech following concussion is likely caused by auditory processing impairments. We hypothesized that concussion disrupts pitch and phonetic processing of a sound, both of which are cues to understanding a talker.

We obtained frequency following responses to a syllable from 120 concussed and 120 control children. Encoding of the fundamental frequency (F0), a pitch cue, and the first formant (F1), a phonetic cue, was poorer in concussed children. The F0 reduction was greater in the children assessed within 2 weeks of their injuries.
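F0 encoding in a frequency-following response is often quantified as the spectral amplitude of the averaged response at the stimulus F0; the sketch below illustrates the idea, with the response array, sampling rate, and 100 Hz F0 all invented for demonstration.

import numpy as np

def f0_encoding(response, sr, f0=100.0, half_bw=5.0):
    """Mean spectral amplitude in a narrow band around the stimulus F0."""
    spec = np.abs(np.fft.rfft(response * np.hanning(len(response))))
    freqs = np.fft.rfftfreq(len(response), 1.0 / sr)
    band = (freqs >= f0 - half_bw) & (freqs <= f0 + half_bw)
    return spec[band].mean()

sr = 8000
t = np.arange(0, 0.2, 1 / sr)
fake_ffr = np.sin(2 * np.pi * 100 * t) + 0.5 * np.random.randn(t.size)  # toy response
print(f0_encoding(fake_ffr, sr))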

CONCLUSION: Concussions affect auditory processing. Results strengthen evidence of reduced F0 encoding in children with concussion and call for longitudinal study aimed at monitoring the recovery course with respect to the auditory system.}, } @article {pmid39026879, year = {2024}, author = {Li, JJ and Daliri, A and Kim, KS and Max, L}, title = {Does pre-speech auditory modulation reflect processes related to feedback monitoring or speech movement planning?.}, journal = {bioRxiv : the preprint server for biology}, volume = {}, number = {}, pages = {}, pmid = {39026879}, issn = {2692-8205}, support = {R01 DC007603/DC/NIDCD NIH HHS/United States ; R01 DC017444/DC/NIDCD NIH HHS/United States ; R01 DC020707/DC/NIDCD NIH HHS/United States ; R01 DC014510/DC/NIDCD NIH HHS/United States ; R01 DC020162/DC/NIDCD NIH HHS/United States ; }, abstract = {Previous studies have revealed that auditory processing is modulated during the planning phase immediately prior to speech onset. To date, the functional relevance of this pre-speech auditory modulation (PSAM) remains unknown. Here, we investigated whether PSAM reflects neuronal processes that are associated with preparing auditory cortex for optimized feedback monitoring as reflected in online speech corrections. Combining electroencephalographic PSAM data from a previous data set with new acoustic measures of the same participants' speech, we asked whether individual speakers' extent of PSAM is correlated with the implementation of within-vowel articulatory adjustments during /b/-vowel-/d/ word productions. Online articulatory adjustments were quantified as the extent of change in inter-trial formant variability from vowel onset to vowel midpoint (a phenomenon known as centering). This approach allowed us to also consider inter-trial variability in formant production and its possible relation to PSAM at vowel onset and midpoint separately. Results showed that inter-trial formant variability was significantly smaller at vowel midpoint than at vowel onset. PSAM was not significantly correlated with this amount of change in variability as an index of within-vowel adjustments. Surprisingly, PSAM was negatively correlated with inter-trial formant variability not only in the middle but also at the very onset of the vowels. Thus, speakers with more PSAM produced formants that were already less variable at vowel onset. Findings suggest that PSAM may reflect processes that influence speech acoustics as early as vowel onset and, thus, that are directly involved in motor command preparation (feedforward control) rather than output monitoring (feedback control).}, } @article {pmid39019670, year = {2024}, author = {Doyle, KA and Harel, D and Feeny, GT and Novak, VD and McAllister, T}, title = {Word and Gender Identification in the Speech of Transgender Individuals.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.06.007}, pmid = {39019670}, issn = {1873-4588}, support = {R21 DC021537/DC/NIDCD NIH HHS/United States ; }, abstract = {Listeners use speech to identify both linguistic information, such as the word being produced, and indexical attributes, such as the gender of the speaker. Previous research has shown that these two aspects of speech perception are interrelated. 
It is important to understand this relationship in the context of gender-affirming voice training (GAVT), where changes in speech production as part of a speaker's gender-affirming care could potentially influence listeners' recognition of the intended utterance. This study conducted a secondary analysis of data from an experiment in which trans women matched shifted targets for the second formant frequency using visual-acoustic biofeedback. Utterances were synthetically altered to feature a gender-ambiguous fundamental frequency and were presented to blinded listeners for rating on a visual analog scale representing the gender spectrum, as well as word identification in a forced-choice task. We found a statistically significant association between the accuracy of word identification and the gender rating of utterances. However, there was no statistically significant difference in word identification accuracy for the formant-shifted conditions relative to an unshifted condition. Overall, these results support previous research in finding that word identification and speaker gender identification are interrelated processes; however, the findings also suggest that a small magnitude of shift in formant frequencies (of the type that might be pursued in a GAVT context) does not have a significant negative impact on the perceptual recoverability of isolated words.}, } @article {pmid38985077, year = {2024}, author = {Lorenzoni, DC and Henriques, JFC and Silva, LKD and Rosa, RR and Berretin-Felix, G and Freitas, KMS and Janson, G}, title = {Comparison of speech changes caused by four different orthodontic retainers: a crossover randomized clinical trial.}, journal = {Dental press journal of orthodontics}, volume = {29}, number = {3}, pages = {e2423277}, pmid = {38985077}, issn = {2177-6709}, mesh = {Humans ; *Orthodontic Retainers ; Female ; Male ; Adult ; *Cross-Over Studies ; Orthodontic Appliance Design ; Young Adult ; Speech/physiology ; }, abstract = {OBJECTIVE: This study aimed to compare the influence of four different maxillary removable orthodontic retainers on speech.

MATERIAL AND METHODS: Eligibility criteria for sample selection were: subjects aged 20-40 years with acceptable occlusion who were native speakers of Portuguese. The volunteers (n=21) were divided into four groups randomized with a 1:1:1:1 allocation ratio. The four groups used, in random order, the four types of retainers full-time for 21 days each, with a 7-day washout period. The removable maxillary retainers were: conventional wraparound, wraparound with an anterior hole, U-shaped wraparound, and thermoplastic retainer. Three volunteers were excluded. The final sample comprised 18 subjects (11 male; 7 female) with a mean age of 27.08 years (SD=4.65). The speech evaluation was performed on recordings of vocal excerpts made before, immediately after, and 21 days after the installation of each retainer, with auditory-perceptual and acoustic analysis of the formant frequencies F1 and F2 of the vowels. Repeated measures ANOVA and Friedman with Tukey tests were used for statistical comparison.
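A repeated-measures ANOVA of this design can be run in Python with statsmodels, as in the following sketch; the data-frame layout (one F1 measurement per subject, retainer, and time point) and all values are hypothetical, not the study's data or software.

import numpy as np
import pandas as pd
from statsmodels.stats.anova import AnovaRM

rng = np.random.default_rng(1)
subjects = range(18)
retainers = ["conventional", "hole", "U-shaped", "thermoplastic"]
times = ["pre", "post", "21d"]
rows = [{"subject": s, "retainer": r, "time": t, "F1": rng.normal(500, 50)}
        for s in subjects for r in retainers for t in times]   # balanced design
df = pd.DataFrame(rows)
res = AnovaRM(df, depvar="F1", subject="subject", within=["retainer", "time"]).fit()
print(res)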

RESULTS: Speech changes increased immediately after conventional wraparound and thermoplastic retainer installation and decreased after 21 days, though not to normal levels. However, this increase was statistically significant only for the wraparound with an anterior hole and the thermoplastic retainer. Formant frequencies of vowels were altered at the initial time point, and the changes remained for the conventional, U-shaped, and thermoplastic appliances after three weeks.

CONCLUSIONS: The thermoplastic retainer was more harmful to speech than the wraparound appliances. The conventional and U-shaped retainers interfered less with speech. The three-week period was not sufficient for speech adaptation.}, } @article {pmid38981448, year = {2024}, author = {Liu, B and Lei, J and Wischhoff, OP and Smereka, KA and Jiang, JJ}, title = {Acoustic Character Governing Variation in Normal, Benign, and Malignant Voices.}, journal = {Folia phoniatrica et logopaedica : official organ of the International Association of Logopedics and Phoniatrics (IALP)}, volume = {}, number = {}, pages = {1-10}, doi = {10.1159/000540255}, pmid = {38981448}, issn = {1421-9972}, abstract = {INTRODUCTION: Benign and malignant vocal fold lesions (VFLs) are growths that occur on the vocal folds. However, the treatments for these two types of lesions differ significantly. Therefore, it is imperative to use a multidisciplinary approach to properly recognize suspicious lesions. This study aimed to determine the important acoustic characteristics specific to benign and malignant VFLs.

METHODS: The acoustic model of voice quality was utilized to measure various acoustic parameters in 157 participants, including individuals with normal, benign, and malignant conditions. The study comprised 62 female and 95 male participants (43 ± 10 years). Voice samples were collected at the Shanghai Eye, Ear, Nose, and Throat Hospital of Fudan University between May 2020 and July 2021. The acoustic variables of the participants were analyzed using Principal Component Analysis (PCA) to identify important acoustic characteristics specific to normal vocal folds, benign VFLs, and malignant VFLs. The similarities and differences in acoustic factors were also studied for benign conditions including Reinke's edema, polyps, cysts, and leukoplakia.
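A PCA step of the kind described in this paragraph typically involves standardizing the acoustic variables, fitting the decomposition, and inspecting explained variance and loadings; a scikit-learn sketch follows, with a random feature matrix standing in for the study's data.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(2)
X = rng.normal(size=(157, 12))                  # placeholder acoustic variables
X_std = StandardScaler().fit_transform(X)       # z-score each variable
pca = PCA(n_components=5).fit(X_std)
print("explained variance ratio:", np.round(pca.explained_variance_ratio_, 3))
print("PC1 loadings:", np.round(pca.components_[0], 2))  # weights of each variable on PC1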

RESULTS: Using the PCA method, the components that accounted for the variation in the data were identified, highlighting acoustic characteristics in the normal, benign, and malignant groups. The analysis indicated that coefficients of variation in root mean square energy were observed solely within the normal group. Coefficients of variation in pitch (F0) were found to be significant only in benign voices, while higher formant frequencies and their variability were identified as contributors to the acoustic variance within the malignant group. The presence of formant dispersion (FD) as a weighted factor in PCA was exclusively noted in individuals with Reinke's edema. The amplitude ratio between subharmonics and harmonics (SHR) and its coefficients of variation were evident exclusively in the polyps group. In the case of voices with cysts, both pitch (F0) and coefficients of variation for FD were observed to contribute to variations. Additionally, higher formant frequencies and their coefficients of variation played a role in the acoustic variance among voices of patients with leukoplakia.

CONCLUSION: Experimental evidence demonstrates the utility of the PCA method in the identification of vibrational alterations in the acoustic characteristics of voice affected by lesions. Furthermore, the PCA analysis has highlighted underlying acoustic differences between various conditions such as Reinke's edema, polyps, cysts, and leukoplakia. These findings can be used in the future to develop an automated malignant voice analysis algorithm, which will facilitate timely intervention and management of vocal fold conditions.}, } @article {pmid38951556, year = {2024}, author = {Fletcher, MD and Akis, E and Verschuur, CA and Perry, SW}, title = {Improved tactile speech perception and noise robustness using audio-to-tactile sensory substitution with amplitude envelope expansion.}, journal = {Scientific reports}, volume = {14}, number = {1}, pages = {15029}, pmid = {38951556}, issn = {2045-2322}, support = {EP/W032422/1//Engineering and Physical Sciences Research Council/ ; EP/T517859/1//Engineering and Physical Sciences Research Council/ ; }, mesh = {Humans ; *Speech Perception/physiology ; Male ; Female ; Adult ; *Noise ; *Hearing Aids ; Young Adult ; Touch/physiology ; Acoustic Stimulation/methods ; Touch Perception/physiology ; Hearing Loss/physiopathology ; }, abstract = {Recent advances in haptic technology could allow haptic hearing aids, which convert audio to tactile stimulation, to become viable for supporting people with hearing loss. A tactile vocoder strategy for audio-to-tactile conversion, which exploits these advances, has recently shown significant promise. In this strategy, the amplitude envelope is extracted from several audio frequency bands and used to modulate the amplitude of a set of vibro-tactile tones. The vocoder strategy allows good consonant discrimination, but vowel discrimination is poor and the strategy is susceptible to background noise. In the current study, we assessed whether multi-band amplitude envelope expansion can effectively enhance critical vowel features, such as formants, and improve speech extraction from noise. In 32 participants with normal touch perception, tactile-only phoneme discrimination with and without envelope expansion was assessed both in quiet and in background noise. Envelope expansion improved performance in quiet by 10.3% for vowels and by 5.9% for consonants. In noise, envelope expansion improved overall phoneme discrimination by 9.6%, with no difference in benefit between consonants and vowels. The tactile vocoder with envelope expansion can be deployed in real-time on a compact device and could substantially improve clinical outcomes for a new generation of haptic hearing aids.}, } @article {pmid38916010, year = {2024}, author = {Sahoo, AK and Sahoo, PK and Gupta, V and Behera, G and Sidam, S and Mishra, UP and Chavan, A and Binu, R and Gour, S and Velayutham, DK and Pooja, and Chatterjee, T and Pal, D}, title = {Assessment of Changes in the Quality of Voice in Post-thyroidectomy Patients With Intact Recurrent and Superior Laryngeal Nerve Function.}, journal = {Cureus}, volume = {16}, number = {5}, pages = {e60873}, pmid = {38916010}, issn = {2168-8184}, abstract = {Background Thyroidectomy is a routinely performed surgical procedure used to treat benign, malignant, and some hormonal disorders of the thyroid that are not responsive to medical therapy. Voice alterations following thyroid surgery are well-documented and often attributed to recurrent laryngeal nerve dysfunction. 
However, subtle changes in voice quality can persist despite anatomically intact laryngeal nerves. This study aimed to quantify post-thyroidectomy voice changes in patients with intact laryngeal nerves, focusing on fundamental frequency, first formant frequency, shimmer intensity, and maximum phonation duration. Methodology This cross-sectional study was conducted at a tertiary referral center in central India and focused on post-thyroidectomy patients with normal vocal cord function. Preoperative assessments included laryngeal endoscopy and voice recording using a computer program, with evaluations repeated at one and three months post-surgery. Patients with normal laryngeal endoscopic findings underwent voice analysis and provided feedback on subjective voice changes. The PRAAT version 6.2 software was utilized for voice analysis. Results The study included 41 patients with normal laryngoscopic findings after thyroid surgery, with the majority being female (85.4%) and the average age being 42.4 years. Hemithyroidectomy was performed in 41.4% of patients and total thyroidectomy in 58.6%, with eight patients undergoing central compartment neck dissection. All but one patient reported no subjective change in voice following surgery. Objective voice analysis showed statistically significant changes in the one-month postoperative period compared to preoperative values, including a 5.87% decrease in fundamental frequency, a 1.37% decrease in shimmer intensity, and a 6.24% decrease in first formant frequency, along with a 4.35% decrease in maximum phonatory duration. These trends persisted at the three-month postoperative period, although values approached preoperative levels. Results revealed statistically significant alterations in voice parameters, particularly fundamental frequency and first formant frequency, with greater values observed in total thyroidectomy patients. Shimmer intensity also exhibited slight changes. Comparison between the hemithyroidectomy and total thyroidectomy groups revealed no significant differences in fundamental frequency, first formant frequency, and shimmer. However, maximum phonation duration showed a significantly greater change in the hemithyroidectomy group at both one-month and three-month postoperative intervals. Conclusions This study on post-thyroidectomy patients with normal vocal cord movement revealed significant changes in voice parameters postoperatively, with most patients reporting no subjective voice changes. The findings highlight the importance of objective voice analysis in assessing post-thyroidectomy voice outcomes.}, } @article {pmid38890016, year = {2024}, author = {Xiu, N and Li, W and Liu, L and Liu, Z and Cai, Z and Li, L and Vaxelaire, B and Sock, R and Ling, Z and Chen, J and Wang, Y}, title = {A Study on Voice Measures in Patients with Parkinson's Disease.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.05.018}, pmid = {38890016}, issn = {1873-4588}, abstract = {PURPOSE: This research aims to identify acoustic features that can distinguish patients with Parkinson's disease (PD patients) from healthy speakers.

METHODS: Thirty PD patients and 30 healthy speakers were recruited in the experiment, and their speech was collected, including three vowels (/i/, /a/, and /u/) and nine consonants (/p/, /pʰ/, /t/, /tʰ/, /k/, /kʰ/, /l/, /m/, and /n/). Acoustic features such as fundamental frequency (F0), jitter, shimmer, harmonics-to-noise ratio (HNR), first formant (F1), second formant (F2), third formant (F3), first bandwidth (B1), second bandwidth (B2), third bandwidth (B3), voice onset, and voice onset time were analyzed in our experiment. The two-sample independent t test and the nonparametric Mann-Whitney U (MWU) test were applied, as appropriate, to compare the acoustic measures between the PD patients and healthy speakers. In addition, after identifying the acoustic features effective for distinguishing PD patients from healthy speakers, we adopted two methods to detect PD patients: (1) building classifiers directly on the effective acoustic features and (2) training support vector machine classifiers with the effective acoustic features.
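As a rough illustration of approach (2), training an SVM on a small set of acoustic features might look like the sketch below; the data, labels, kernel, and cross-validation scheme are all assumptions for demonstration, not the authors' implementation.

```python
# Minimal sketch: SVM classification of PD vs. healthy speakers from acoustic features.
# Feature matrix and labels are random placeholders.
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 7))   # 60 speakers x 7 effective acoustic features (placeholder)
y = np.repeat([0, 1], 30)      # 0 = healthy control, 1 = PD (placeholder labels)

clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=1.0))
acc = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
print(f"cross-validated accuracy: {acc.mean():.2f}")
```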

RESULTS: Significant differences were found between the male PD group and the male healthy controls in vowel /i/ (jitter and shimmer) and /a/ (shimmer and HNR). Among female subjects, significant differences were observed in the F0 standard deviation (F0 SD) of /u/ between the two groups. Additionally, significant differences between the PD group and healthy controls were also found in the F3 of /i/ and /n/, whereas other acoustic features showed no significant differences between the two groups. The HNR of vowel /a/ yielded the best classification accuracy of the seven acoustic features found to distinguish PD patients from healthy speakers.

CONCLUSIONS: PD can cause changes in the articulation and phonation of PD patients, wherein increases or decreases occur in some acoustic features. Therefore, the use of acoustic features to detect PD is expected to be a low-cost and large-scale diagnostic method.}, } @article {pmid38880296, year = {2024}, author = {Weirich, M and Simpson, AP and Knutti, N}, title = {Effects of testosterone on speech production and perception: Linking hormone levels in males to vocal cues and female voice attractiveness ratings.}, journal = {Physiology & behavior}, volume = {283}, number = {}, pages = {114615}, doi = {10.1016/j.physbeh.2024.114615}, pmid = {38880296}, issn = {1873-507X}, mesh = {Humans ; *Testosterone/metabolism/pharmacology ; Male ; Adult ; Young Adult ; *Saliva/metabolism/chemistry ; *Hydrocortisone/metabolism ; *Speech Perception/physiology/drug effects ; *Speech/physiology/drug effects ; *Voice/drug effects ; *Cues ; Female ; Beauty ; Acoustic Stimulation ; }, abstract = {This study sets out to investigate the potential effect of males' testosterone level on speech production and speech perception. Regarding speech production, we investigate intra- and inter-individual variation in mean fundamental frequency (fo) and formant frequencies and highlight the potential interacting effect of another hormone, i.e. cortisol. In addition, we investigate the influence of different speech materials on the relationship between testosterone and speech production. Regarding speech perception, we investigate the potential effect of individual differences in males' testosterone level on ratings of attractiveness of female voices. In the production study, data is gathered from 30 healthy adult males ranging from 19 to 27 years (mean age: 22.4, SD: 2.2) who recorded their voices and provided saliva samples at 9 am, 12 noon and 3 pm on a single day. Speech material consists of sustained vowels, counting, read speech and a free description of pictures. Biological measures comprise speakers' height, grip strength, and hormone levels (testosterone and cortisol). In the perception study, participants were asked to rate the attractiveness of female voice stimuli (sentence stimulus, same-speaker pairs) that were manipulated in three steps regarding mean fo and formant frequencies. Regarding speech production, our results show that testosterone affected mean fo (but not formants) both within and between speakers. This relationship was weakened in speakers with high cortisol levels and depended on the speech material. Regarding speech perception, we found female stimuli with higher mean fo and formants to be rated as sounding more attractive than stimuli with lower mean fo and formants. Moreover, listeners with low testosterone showed an increased sensitivity to vocal cues of female attractiveness. While our results of the production study support earlier findings of a relationship between testosterone and mean fo in males (which is mediated by cortisol), they also highlight the relevance of the speech material: The effect of testosterone was strongest in sustained vowels, potentially due to a strengthened effect of hormones on physiologically strongly influenced tasks such as sustained vowels in contrast to more free speech tasks such as a picture description. 
The perception study is the first to show an effect of males' testosterone level on female attractiveness ratings using voice stimuli.}, } @article {pmid38852197, year = {2024}, author = {Krupić, F and Moravcova, M and Dervišević, E and Čustović, S and Grbić, K and Lindström, P}, title = {When time does not heal all wounds: three decades' experience of immigrants living in Sweden.}, journal = {Medicinski glasnik : official publication of the Medical Association of Zenica-Doboj Canton, Bosnia and Herzegovina}, volume = {21}, number = {2}, pages = {}, doi = {10.17392/1696-21-02}, pmid = {38852197}, issn = {1840-2445}, abstract = {AIM: To investigate how immigrants from the Balkan region experienced their current life situation after living in Sweden for 30 years or more.

MATERIALS: The study was designed as a qualitative study using data from interviews with informants from five Balkan countries. The inclusion criteria were informants who were immigrants to Sweden and had lived in Sweden for more than 30 years. Five groups comprising sixteen informants were invited to participate in the study, and they all agreed.

RESULTS: The analysis of the interviews resulted in three main categories: "from someone to no one", "labour market", and "discrimination". All the informants reported that their education and life experience were treated as worthless; that they had to start their lives over and re-educate; that they applied for many jobs, often without reply; and that, on finally getting a job for which they were educated, they were humiliated every day, treated separately, and discriminated against.

CONCLUSION: Coming to Sweden with all their problems, finding that their education and work experience counted for nothing in Sweden, studying Swedish and repeating all their education, applying for jobs without receiving answers, and finally getting a job but being treated differently and discriminated against on a daily basis was experienced by all the informants as terrible. Although many similar studies exist in Sweden, further work of this kind remains valuable for prospective immigrants and prospective employers in Sweden.}, } @article {pmid38847582, year = {2024}, author = {Mittapalle, KR and Alku, P}, title = {Classification of phonation types in singing voice using wavelet scattering network-based features.}, journal = {JASA express letters}, volume = {4}, number = {6}, pages = {}, doi = {10.1121/10.0026241}, pmid = {38847582}, issn = {2691-1191}, abstract = {The automatic classification of phonation types in singing voice is essential for tasks such as identification of singing style. In this study, it is proposed to use wavelet scattering network (WSN)-based features for classification of phonation types in singing voice. WSN, which has a close similarity with auditory physiological models, generates acoustic features that greatly characterize the information related to pitch, formants, and timbre. Hence, the WSN-based features can effectively capture the discriminative information across phonation types in singing voice. The experimental results show that the proposed WSN-based features improved phonation classification accuracy by at least 9% compared to state-of-the-art features.}, } @article {pmid38841122, year = {2024}, author = {Gorina-Careta, N and Arenillas-Alcón, S and Puertollano, M and Mondéjar-Segovia, A and Ijjou-Kadiri, S and Costa-Faidella, J and Gómez-Roig, MD and Escera, C}, title = {Exposure to bilingual or monolingual maternal speech during pregnancy affects the neurophysiological encoding of speech sounds in neonates differently.}, journal = {Frontiers in human neuroscience}, volume = {18}, number = {}, pages = {1379660}, pmid = {38841122}, issn = {1662-5161}, abstract = {INTRODUCTION: Exposure to maternal speech during the prenatal period shapes speech perception and linguistic preferences, allowing neonates to recognize stories heard frequently in utero and demonstrating an enhanced preference for their mother's voice and native language. Yet, with a high prevalence of bilingualism worldwide, it remains an open question whether monolingual and bilingual maternal speech during pregnancy influence the fetus' neural mechanisms underlying speech sound encoding differently.

METHODS: In the present study, the frequency-following response (FFR), an auditory evoked potential that reflects the complex spectrotemporal dynamics of speech sounds, was recorded to a two-vowel /oa/ stimulus in a sample of 129 healthy term neonates within 1 to 3 days after birth. Newborns were divided into two groups according to maternal language usage during the last trimester of gestation (monolingual; bilingual). Spectral amplitudes and spectral signal-to-noise ratios (SNR) at the stimulus fundamental (F0) and first formant (F1) frequencies of each vowel were, respectively, taken as measures of pitch and formant structure neural encoding.
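A minimal sketch of how spectral amplitude and a spectral SNR at a target frequency (e.g., a stimulus F0) could be computed from an averaged FFR epoch is shown below; the sampling rate, epoch, target frequency, and noise-band definition are illustrative assumptions, not the authors' pipeline.

```python
# Minimal sketch: spectral amplitude and SNR at a target frequency from an FFR epoch.
import numpy as np

def spectral_amp_and_snr(epoch, fs, f_target, noise_bw=50.0):
    spec = np.abs(np.fft.rfft(epoch)) / len(epoch)        # single-sided amplitude spectrum
    freqs = np.fft.rfftfreq(len(epoch), d=1.0 / fs)
    target = spec[np.argmin(np.abs(freqs - f_target))]    # amplitude at the target bin
    noise_idx = (np.abs(freqs - f_target) > 5) & (np.abs(freqs - f_target) < noise_bw)
    noise = spec[noise_idx].mean()                        # mean amplitude of flanking bins
    return target, 20 * np.log10(target / noise)          # amplitude, and SNR in dB

fs = 16000
epoch = np.random.randn(fs // 4)                          # placeholder 250-ms averaged response
amp, snr = spectral_amp_and_snr(epoch, fs, f_target=113.0)  # 113 Hz is a hypothetical F0
print(amp, snr)
```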

RESULTS: Our results reveal that while spectral amplitudes at F0 did not differ between groups, neonates from bilingual mothers exhibited a lower spectral SNR. Additionally, monolingually exposed neonates exhibited a higher spectral amplitude and SNR at F1 frequencies.

DISCUSSION: We interpret our results as reflecting that bilingual maternal speech, compared with monolingual speech, is characterized by greater complexity in the speech sound signal, rendering newborns of bilingual mothers sensitive to a wider range of speech frequencies without generating a particularly strong response at any of them. Our results contribute to an expanding body of research indicating the influence of prenatal experiences on language acquisition and underscore the necessity of including prenatal language exposure in developmental studies on language acquisition, a variable often overlooked yet capable of influencing research outcomes.}, } @article {pmid38820240, year = {2024}, author = {Wu, HY}, title = {Uncovering Gender-Specific and Cross-Gender Features in Mandarin Deception: An Acoustic and Electroglottographic Approach.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {7}, pages = {2021-2037}, doi = {10.1044/2024_JSLHR-23-00288}, pmid = {38820240}, issn = {1558-9102}, mesh = {Humans ; Female ; Male ; *Speech Acoustics ; Young Adult ; Adult ; *Deception ; *Language ; Glottis/physiology ; Sex Factors ; China ; Electrodiagnosis ; }, abstract = {PURPOSE: This study aimed to investigate the acoustic and electroglottographic (EGG) profiles of Mandarin deception, including global characteristics and the influence of gender.

METHOD: Thirty-six Mandarin speakers participated in an interactive interview game in which they provided both deceptive and truthful answers to 14 biographical questions. Acoustic and EGG signals of the participants' responses were simultaneously recorded; 20 acoustic and 14 EGG features were analyzed using binary logistic regression models.
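A binary logistic regression over acoustic/EGG predictors, as described above, can be sketched with statsmodels; the feature set, trial counts, and data below are placeholders, not the study's materials.

```python
# Minimal sketch: logistic regression predicting deceptive (1) vs. truthful (0) responses.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
# 36 speakers x 14 questions = 504 responses (placeholder count and features:
# e.g., F0 mean, intensity mean, jitter, contact quotient)
X = rng.normal(size=(504, 4))
y = rng.integers(0, 2, size=504)

results = sm.Logit(y, sm.add_constant(X)).fit(disp=False)
print(results.summary())   # coefficient signs show increases/decreases with deception
```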

RESULTS: Increases in fundamental frequency (F0) mean, intensity mean, first formant (F1), fifth formant (F5), contact quotient (CQ), decontacting-time quotient (DTQ), and contact index (CI) as well as decreases in jitter, shimmer, harmonics-to-noise ratio (HNR), and fourth formant (F4) were significantly correlated with global deception. Cross-gender features included increases in intensity mean and F5 and decreases in jitter, HNR, and F4, whereas gender-specific features encompassed increases in F0 mean, shimmer, F1, third formant, and DTQ, as well as decreases in F0 maximum and CQ for female deception, and increases in CQ and CI and decreases in shimmer for male deception.

CONCLUSIONS: The results suggest that Mandarin deception could be tied to underlying pragmatic functions, emotional arousal, decreased glottal contact skewness, and more pressed phonation. Disparities in gender-specific features lend support to differences in the use of pragmatics, levels of deception-induced emotional arousal, skewness of glottal contact patterns, and phonation types.}, } @article {pmid38789366, year = {2024}, author = {Neuhaus, TJ and Scherer, RC and Whitfield, JA}, title = {Gender Perception of Speech: Dependence on Fundamental Frequency, Implied Vocal Tract Length, and Source Spectral Tilt.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.01.014}, pmid = {38789366}, issn = {1873-4588}, abstract = {OBJECTIVE: To investigate how listeners use fundamental frequency, implied vocal tract length, and source spectral tilt to infer speaker gender.

METHODS: Sound files each containing the vowels /i, æ, ɑ, u/ interspersed with brief silences were synthesized. Each of the 210 stimuli was a combination of 10 values for fundamental frequency and 7 values for implied vocal tract length (and the associated formant frequencies) ranging from male-typical to female-typical, and 3 values for source spectral tilt approximating the voice qualities of breathy, normal, and pressed. Twenty-three listeners judged each synthesized "speaker" as "female" or "male." Generalized linear mixed model analysis was used to determine the extent to which fundamental frequency, implied vocal tract length, and spectral tilt influenced listener judgment.
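The 10 x 7 x 3 factorial design yields the 210 stimuli mentioned above; a minimal sketch of building such a grid follows. The endpoint values are hypothetical male-typical to female-typical settings, since the abstract does not report the actual values.

```python
# Minimal sketch: the full factorial stimulus grid (10 f0 x 7 VTL x 3 tilt = 210).
import itertools
import numpy as np

f0_values = np.linspace(100, 220, 10)     # Hz; hypothetical endpoints
vtl_values = np.linspace(17.5, 14.0, 7)   # cm; hypothetical implied vocal tract lengths
tilt_values = ["breathy", "normal", "pressed"]

stimuli = list(itertools.product(f0_values, vtl_values, tilt_values))
assert len(stimuli) == 210
```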

RESULTS: Increasing fundamental frequency and decreasing implied vocal tract length resulted in an increased probability of female judgment. Two interactions were identified: both an increase in fundamental frequency and a decrease in source spectral tilt (more negative) produced a greater increase in the probability of a female judgment when the implied vocal tract length was relatively short.

CONCLUSIONS: The relationships among fundamental frequency, implied vocal tract length, source spectral tilt, and probability of female judgment changed across the range of normal values, suggesting that the relative contributions of fundamental frequency and implied vocal tract length to gender perception varied over the ranges studied. There was no threshold of fundamental frequency or implied vocal tract length that dramatically shifted the perception between male and female.}, } @article {pmid38782960, year = {2024}, author = {Balolia, KL and Fitzgerald, PL}, title = {Male proboscis monkey cranionasal size and shape is associated with visual and acoustic signalling.}, journal = {Scientific reports}, volume = {14}, number = {1}, pages = {10715}, pmid = {38782960}, issn = {2045-2322}, mesh = {Animals ; Male ; *Sex Characteristics ; Nasal Cavity/anatomy & histology/physiology ; Nose/anatomy & histology ; Animal Communication ; Acoustics ; Skull/anatomy & histology ; Vocalization, Animal/physiology ; Female ; }, abstract = {The large nose adorned by adult male proboscis monkeys is hypothesised to serve as an audiovisual signal of sexual selection. It serves as a visual signal of male quality and social status, and as an acoustic signal, through the expression of loud, low-formant nasalised calls in dense rainforests, where visibility is poor. However, it is unclear how the male proboscis monkey nasal complex, including the internal structure of the nose, plays a role in visual or acoustic signalling. Here, we use cranionasal data to assess whether large noses found in male proboscis monkeys serve visual and/or acoustic signalling functions. Our findings support a visual signalling function for male nasal enlargement through a relatively high degree of nasal aperture sexual size dimorphism, the craniofacial region to which nasal soft tissue attaches. We additionally find nasal aperture size increases beyond dental maturity among male proboscis monkeys, consistent with the visual signalling hypothesis. We show that the cranionasal region has an acoustic signalling role through pronounced nasal cavity sexual shape dimorphism, wherein male nasal cavity shape allows the expression of loud, low-formant nasalised calls. Our findings provide robust support for the male proboscis monkey nasal complex serving both visual and acoustic functions.}, } @article {pmid38778635, year = {2024}, author = {Beach, SD and Niziolek, CA}, title = {Inhibitory modulation of speech trajectories: Evidence from a vowel-modified Stroop task.}, journal = {Cognitive neuropsychology}, volume = {41}, number = {1-2}, pages = {51-69}, pmid = {38778635}, issn = {1464-0627}, support = {R00 DC014520/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Stroop Test ; *Inhibition, Psychological ; Male ; *Speech/physiology ; Female ; *Reaction Time/physiology ; Adult ; Young Adult ; Reading ; Phonetics ; Attention/physiology ; }, abstract = {How does cognitive inhibition influence speaking? The Stroop effect is a classic demonstration of the interference between reading and color naming. We used a novel variant of the Stroop task to measure whether this interference impacts not only the response speed, but also the acoustic properties of speech. Speakers named the color of words in three categories: congruent (e.g., red written in red), color-incongruent (e.g., green written in red), and vowel-incongruent - those with partial phonological overlap with their color (e.g., rid written in red, grain in green, and blow in blue). 
Our primary aim was to identify any effect of the distractor vowel on the acoustics of the target vowel. Participants were no slower to respond on vowel-incongruent trials, but formant trajectories tended to show a bias away from the distractor vowel, consistent with a phenomenon of acoustic inhibition that increases contrast between confusable alternatives.}, } @article {pmid38755075, year = {2024}, author = {Aaen, M and Sadolin, C}, title = {Towards Improved Auditory-Perceptual Assessment of Timbres: Comparing Accuracy and Reliability of Four Deconstructed Timbre Assessment Models.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.03.039}, pmid = {38755075}, issn = {1873-4588}, abstract = {UNLABELLED: Timbre is a central quality of singing, yet remains a complex notion poorly understood in psychoacoustic studies. Previous studies note how no single acoustic variable or combination of variables consistently predicts timbre dimensions. Timbre varies on a continuum from darkest to lightest. These extremes are associated with laryngeal and vocal tract adjustments related to smaller and larger vocal tract area and variations in vocal fold vibratory characteristics. Perceptually, timbre assessment is influenced by spectral characteristics and formant frequency adjustments, though these dimensions are not independently perceived. Perceptual studies repeatedly demonstrate difficulties in correlating variations in timbre stimuli to specific measures. A recent study demonstrated how the acoustic predictive salience of voice category and voice weight across pitches contributes to timbre assessments and concluded that timbre may be related to as-yet-unknown factor(s). The purpose of this study was to test four different models for assessing timbre: one focused on specific anatomy, one on listener intuition, one using auditory anchors, and one using expert raters in a deconstructed timbre model with five specific dimensions.

METHODS: Four independent panels were conducted with separate cohorts of professional singing teachers. Forty-one assessors took part in the anatomically focused panel, 54 in the intuition-based panel, 30 in the anchored panel, and 12 in the expert listener panel. Stimuli taken from live performances of well-known singers were used for all panels, representing all genders, genres, and styles across a large pitch range. All stimuli are available as Supplementary Materials. Fleiss' kappa values, descriptive statistics, and significance tests are reported for all panel assessments.

RESULTS: Panels 1 through 4 varied in overall accuracy and agreement. The intuition-based model showed overall 45% average accuracy (SD ± 4%), k = 0.289 (<0.001) compared to overall 71% average accuracy (SD ± 3%), k = 0.368 (<0.001) of the anatomical focused panel. The auditory-anchored model showed overall 75% average accuracy (SD ± 8%), k = 0.54 (<0.001) compared with overall 83% average accuracy and agreement of k = 0.63 (<0.001) for panel 4. Results revealed that the highest accuracy and reliability were achieved in a deconstructed timbre model and that providing anchoring improved reliability but with no further increase in accuracy.
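The agreement statistic reported above, Fleiss' kappa, can be computed as in the sketch below; the ratings matrix (stimuli by assessors, integer category codes) is a random placeholder, not the study's data.

```python
# Minimal sketch: Fleiss' kappa over panel ratings.
import numpy as np
from statsmodels.stats.inter_rater import aggregate_raters, fleiss_kappa

rng = np.random.default_rng(1)
ratings = rng.integers(0, 3, size=(76, 12))   # 76 stimuli rated by 12 raters (placeholder)

table, _ = aggregate_raters(ratings)          # per-stimulus counts for each category
print(fleiss_kappa(table, method="fleiss"))
```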

CONCLUSION: Deconstructing timbre into specific parameters improved auditory-perceptual accuracy and overall agreement. Assessing timbre along with other perceptual dimensions improves accuracy and reliability. Panel assessors' expert level of listening skills remains an important factor in obtaining reliable and accurate assessments of auditory stimuli for timbre dimensions. Anchoring improved reliability but with no further increase in accuracy. The study suggests that timbre assessment can be improved by approaching the percept through a prism of five specific dimensions, each related to specific physiology and auditory-perceptual subcategories. Further tests with framework-naïve listeners, nonmusically educated listeners, artificial intelligence comparisons, and synthetic stimuli are needed to further assess reliability.}, } @article {pmid38754028, year = {2024}, author = {Ning, LH and Hui, TC}, title = {The Accompanying Effect in Responses to Auditory Perturbations: Unconscious Vocal Adjustments to Unperturbed Parameters.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {6}, pages = {1731-1751}, doi = {10.1044/2024_JSLHR-23-00543}, pmid = {38754028}, issn = {1558-9102}, mesh = {Humans ; Male ; Female ; Young Adult ; *Pitch Perception/physiology ; Adult ; *Bayes Theorem ; Speech Perception/physiology ; Loudness Perception/physiology ; Feedback, Sensory/physiology ; Voice/physiology ; Acoustic Stimulation/methods ; Speech Acoustics ; }, abstract = {PURPOSE: The present study examined whether participants respond to unperturbed parameters while experiencing specific perturbations in auditory feedback. For instance, we aim to determine if speakers adjust voice loudness when only pitch is artificially altered in auditory feedback. This phenomenon is referred to as the "accompanying effect" in the present study.

METHOD: Thirty native Mandarin speakers were asked to sustain the vowel /ɛ/ for 3 s while their auditory feedback underwent single shifts in one of three distinct ways: pitch shift (±100 cents; coded as PT), loudness shift (±6 dB; coded as LD), or first formant (F1) shift (±100 Hz; coded as FM). Participants were instructed to ignore the perturbations in their auditory feedback. Response types were categorized based on pitch, loudness, and F1 for each individual trial, with, for example, Popp_Lopp_Fopp indicating opposing responses in all three domains.
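A minimal sketch of how a single trial might be coded as opposing or following in one domain is given below; the input format and the zero threshold are assumptions for illustration, not the authors' categorization code.

```python
# Minimal sketch: coding one domain's response relative to the applied perturbation.
# A response "opposes" the shift when it moves opposite in sign to the perturbation.
def code_response(shift_sign: int, response_change: float, eps: float = 0.0) -> str:
    """shift_sign: +1 or -1 for the perturbation direction; response_change:
    produced change in the same unit (cents, dB, or Hz) relative to baseline."""
    if response_change * shift_sign < -eps:
        return "opposing"
    if response_change * shift_sign > eps:
        return "following"
    return "none"

# e.g., a +100-cent pitch shift answered by an 18-cent downward production change:
print(code_response(+1, -18.0))   # -> "opposing"
```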

RESULTS: The accompanying effect appeared 93% of the time. Bayesian Poisson regression models indicate that opposing responses in all three domains (Popp_Lopp_Fopp) were the most prevalent response type across the conditions (PT, LD, and FM). The more frequently used response types exhibited opposing responses and significantly larger response curves than the less frequently used response types. Following responses became more prevalent only when the perturbed stimuli were perceived as voices from someone else (external references), particularly in the FM condition. In terms of isotropy, loudness and F1 tended to change in the same direction more often than loudness and pitch did.

CONCLUSION: The presence of the accompanying effect suggests that the motor systems responsible for regulating pitch, loudness, and formants are not entirely independent but rather interconnected to some degree.}, } @article {pmid38741274, year = {2024}, author = {Ekström, AG}, title = {Correcting the record: Phonetic potential of primate vocal tracts and the legacy of Philip Lieberman (1934-2022).}, journal = {American journal of primatology}, volume = {86}, number = {8}, pages = {e23637}, doi = {10.1002/ajp.23637}, pmid = {38741274}, issn = {1098-2345}, mesh = {Animals ; *Vocalization, Animal ; *Phonetics ; *Primates/physiology/anatomy & histology ; Humans ; History, 20th Century ; Speech/physiology ; Biological Evolution ; }, abstract = {The phonetic potential of nonhuman primate vocal tracts has been the subject of considerable contention in recent literature. Here, the work of Philip Lieberman (1934-2022) is considered at length, and two research papers-both purported challenges to Lieberman's theoretical work-and a review of Lieberman's scientific legacy are critically examined. I argue that various aspects of Lieberman's research have been consistently misinterpreted in the literature. A paper by Fitch et al. overestimates the would-be "speech-ready" capacities of a rhesus macaque, and the data presented nonetheless supports Lieberman's principal position-that nonhuman primates cannot articulate the full extent of human speech sounds. The suggestion that no vocal anatomical evolution was necessary for the evolution of human speech (as spoken by all normally developing humans) is not supported by phonetic or anatomical data. The second challenge, by Boë et al., attributes vowel-like qualities of baboon calls to articulatory capacities based on audio data; I argue that such "protovocalic" properties likely result from disparate articulatory maneuvers compared to human speakers. A review of Lieberman's scientific legacy by Boë et al. ascribes a view of speech evolution (which the authors term "laryngeal descent theory") to Lieberman, which contradicts his writings. The present article documents a pattern of incorrect interpretations of Lieberman's theoretical work in recent literature. Finally, the apparent trend of vowel-like formant dispersions in great ape vocalization literature is discussed with regard to Lieberman's theoretical work. The review concludes that the "Lieberman account" of primate vocal tract phonetic capacities remains supported by research: the ready articulation of fully human speech reflects species-unique anatomy.}, } @article {pmid38738242, year = {2024}, author = {Cao, S and Rosenzweig, I and Bilotta, F and Jiang, H and Xia, M}, title = {Automatic detection of obstructive sleep apnea based on speech or snoring sounds: a narrative review.}, journal = {Journal of thoracic disease}, volume = {16}, number = {4}, pages = {2654-2667}, pmid = {38738242}, issn = {2072-1439}, abstract = {BACKGROUND AND OBJECTIVE: Obstructive sleep apnea (OSA) is a common chronic disorder characterized by repeated breathing pauses during sleep caused by upper airway narrowing or collapse. The gold standard for OSA diagnosis is the polysomnography test, which is time consuming, expensive, and invasive. In recent years, more cost-effective approaches to OSA detection, based on the predictive value of speech and snoring sounds, have emerged.
In this paper, we offer a comprehensive summary of current research progress on the applications of speech or snoring sounds for the automatic detection of OSA and discuss the key challenges that need to be overcome for future research into this novel approach.

METHODS: PubMed, IEEE Xplore, and Web of Science databases were searched with related keywords. Literature published between 1989 and 2022 examining the potential of using speech or snoring sounds for automated OSA detection was reviewed.

KEY CONTENT AND FINDINGS: Speech and snoring sounds contain a large amount of information about OSA, and they have been extensively studied in the automatic screening of OSA. By importing features extracted from speech and snoring sounds into artificial intelligence models, clinicians can automatically screen for OSA. Features such as formants, linear prediction cepstral coefficients, and mel-frequency cepstral coefficients, together with artificial intelligence algorithms including support vector machines, Gaussian mixture models, and hidden Markov models, have been extensively studied for the detection of OSA.
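As a toy illustration of one feature/classifier pairing named above (MFCCs with a support vector machine), consider the sketch below; the file names and labels are placeholders, and real systems use far richer feature sets and larger corpora.

```python
# Minimal sketch: MFCC summary features feeding an SVM for OSA screening.
import numpy as np
import librosa
from sklearn.svm import SVC

def mfcc_features(path):
    y, sr = librosa.load(path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    # summarize each coefficient's trajectory by mean and SD -> 26-dim vector
    return np.concatenate([mfcc.mean(axis=1), mfcc.std(axis=1)])

# hypothetical snoring/speech recordings labeled OSA (1) / non-OSA (0)
paths, labels = ["s1.wav", "s2.wav"], [1, 0]
X = np.array([mfcc_features(p) for p in paths])
clf = SVC(kernel="rbf").fit(X, labels)
```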

CONCLUSIONS: Due to the significant advantages of noninvasive, low-cost, and contactless data collection, an automatic approach based on speech or snoring sounds seems to be a promising tool for the detection of OSA.}, } @article {pmid38717213, year = {2024}, author = {Feng, H and Wang, L}, title = {Acoustic analysis of English tense and lax vowels: Comparing the production between Mandarin Chinese learners and native English speakers.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {5}, pages = {3071-3089}, doi = {10.1121/10.0025931}, pmid = {38717213}, issn = {1520-8524}, mesh = {Humans ; *Speech Acoustics ; *Phonetics ; Male ; Female ; *Multilingualism ; Young Adult ; Speech Production Measurement ; Adult ; Language ; Acoustics ; Learning ; Voice Quality ; Sound Spectrography ; East Asian People ; }, abstract = {This study investigated how 40 Chinese learners of English as a foreign language (EFL learners) differed from 40 native English speakers in the production of four English tense-lax contrasts, /i-ɪ/, /u-ʊ/, /ɑ-ʌ/, and /æ-ε/, by examining the acoustic measurements of duration, the first three formant frequencies, and the slope of the first formant movement (F1 slope). The dynamic formant trajectory was modeled using discrete cosine transform coefficients to demonstrate the time-varying properties of formant trajectories. A discriminant analysis was employed to illustrate the extent to which Chinese EFL learners relied on different acoustic parameters. This study found that: (1) Chinese EFL learners overemphasized durational differences and weakened spectral differences for the /i-ɪ/, /u-ʊ/, and /ɑ-ʌ/ pairs, although they maintained sufficient spectral differences for /æ-ε/. In contrast, native English speakers predominantly used spectral differences across all four pairs; (2) in non-low tense-lax contrasts, unlike native English speakers, Chinese EFL learners failed to exhibit different F1 slope values, indicating a non-nativelike tongue-root placement during the articulatory process. The findings underscore the contribution of dynamic spectral patterns to the differentiation between English tense and lax vowels, and reveal the influence of precise articulatory gestures on the realization of the tense-lax contrast.}, } @article {pmid38714709, year = {2024}, author = {Ostrega, J and Shiramizu, V and Lee, AJ and Jones, BC and Feinberg, DR}, title = {No evidence that averaging voices influences attractiveness.}, journal = {Scientific reports}, volume = {14}, number = {1}, pages = {10488}, pmid = {38714709}, issn = {2045-2322}, support = {EP/T023783/1//Engineering and Physical Sciences Research Council/ ; RGPIN-2023-05146//Natural Sciences and Engineering Research Council of Canada/ ; }, mesh = {Humans ; Male ; Female ; *Voice/physiology ; Adult ; Young Adult ; *Beauty ; Judgment/physiology ; Adolescent ; }, abstract = {Vocal attractiveness influences important social outcomes. While most research on the acoustic parameters that influence vocal attractiveness has focused on the possible roles of sexually dimorphic characteristics of voices, such as fundamental frequency (i.e., pitch) and formant frequencies (i.e., a correlate of body size), other work has reported that increasing vocal averageness increases attractiveness. Here we investigated the roles these three characteristics play in judgments of the attractiveness of male and female voices. 
In Study 1, we found that increasing vocal averageness significantly decreased distinctiveness ratings, demonstrating that participants could detect manipulations of vocal averageness in this stimulus set with this testing paradigm. However, in Study 2, we found no evidence that increasing averageness significantly increased attractiveness ratings of voices. In Study 3, we found that fundamental frequency was negatively correlated with male vocal attractiveness and positively correlated with female vocal attractiveness. By contrast with these results for fundamental frequency, vocal attractiveness and formant frequencies were not significantly correlated. Collectively, our results suggest that averageness may not necessarily significantly increase attractiveness judgments of voices and are consistent with previous work reporting significant associations between attractiveness and voice pitch.}, } @article {pmid38704279, year = {2024}, author = {Leyns, C and Adriaansen, A and Daelman, J and Bostyn, L and Meerschman, I and T'Sjoen, G and D'haeseleer, E}, title = {Long-term Acoustic Effects of Gender-Affirming Voice Training in Transgender Women.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.04.007}, pmid = {38704279}, issn = {1873-4588}, abstract = {OBJECTIVES: One role of a speech-language pathologist (SLP) is to help transgender clients develop healthy, gender-congruent communication. Transgender women frequently approach SLPs to train their voices to sound more feminine; however, the long-term acoustic effects of training need to be rigorously examined in effectiveness studies. The aim of this study was to investigate the long-term effects (follow-up 1: 3 months and follow-up 2: 1 year after the last session) of gender-affirming voice training for transgender women, in terms of acoustic parameters.

STUDY DESIGN: This study was a randomized sham-controlled trial with a cross-over design.

METHODS: Twenty-six transgender women were included for follow-up 1 and 18 for follow-up 2. All participants received 14 weeks of gender-affirming voice training (4 weeks of sham training and 10 weeks of voice feminization training: 5 weeks of pitch elevation training and 5 weeks of articulation-resonance training), but in a different order. Speech samples were recorded with Praat at four different time points (pre, post, follow-up 1, follow-up 2). Acoustic analysis included fo of the sustained vowel /a:/, reading, and spontaneous speech. Formant frequencies (F1-F2-F3) of the vowels /a/, /i/, and /u/ were determined and the vowel space was calculated. A linear mixed model was used to compare the acoustic voice measurements between time points (pre - post, pre - follow-up 1, pre - follow-up 2, post - follow-up 1, post - follow-up 2, follow-up 1 - follow-up 2).

RESULTS: Most of the fo measurements and formant frequencies that increased immediately after the intervention were stable at both follow-up measurements. The median fo during the sustained vowel, reading, and spontaneous speech remained elevated at both follow-ups compared with the pre-measurement. However, a decrease of 16 Hz/1.7 ST (reading) and 12 Hz/1.5 ST (spontaneous speech) was detected between the post-measurement (169 Hz for reading, 144 Hz for spontaneous speech) and 1 year after the last session (153 Hz and 132 Hz, respectively). The lower limit of fo did not change during reading and spontaneous speech, either directly after the intervention or during both follow-ups. F1-F2 of the vowel /a/ and the vowel space were increased after the intervention and at both follow-ups. Individual analyses showed that more aspects should be controlled after the intervention, such as exercises performed at home or the duration of extra gender-affirming voice training sessions.
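The paired Hz and semitone values above follow from the standard interval formula, 12 * log2(f_to / f_from); a two-line check:

```python
# Minimal check of the Hz-to-semitone conversions reported above.
import math

def semitones(f_from: float, f_to: float) -> float:
    return 12 * math.log2(f_to / f_from)

print(round(semitones(169, 153), 1))   # reading: ~ -1.7 ST for the 16 Hz drop
print(round(semitones(144, 132), 1))   # spontaneous speech: ~ -1.5 ST for the 12 Hz drop
```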

CONCLUSIONS: After 10 sessions of voice feminization training and follow-up measurements after 3 months and 1 year, stable increases were found for some formant frequencies and fo measurements, but not all of them. More time should be spent on increasing the fifth percentile of fo, as the lower limit of fo also contributes to the perception of a more feminine voice.}, } @article {pmid38693788, year = {2024}, author = {Kocjančič, T and Bořil, T and Hofmann, S}, title = {Acoustic and Articulatory Visual Feedback in Classroom L2 Vowel Remediation.}, journal = {Language and speech}, volume = {}, number = {}, pages = {238309231223736}, doi = {10.1177/00238309231223736}, pmid = {38693788}, issn = {1756-6053}, abstract = {This paper presents L2 vowel remediation in a classroom setting via two real-time visual feedback methods: articulatory ultrasound tongue imaging, which shows tongue shape and position, and a newly developed acoustic formant analyzer, which visualizes a point correlating with the combined effect of tongue position and lip rounding in a vowel quadrilateral. Ten Czech students of the Swedish language participated in the study. Swedish vowel production is difficult for Czech speakers since the languages differ significantly in their vowel systems. The students selected the vowel targets on their own and practiced in two classroom groups, with six students receiving two ultrasound training lessons, followed by one acoustic, and four students receiving two acoustic lessons, followed by one ultrasound. Audio data were collected pre-training, after the two sessions employing the first visual feedback method, and at post-training, allowing measurement of the Euclidean distance among selected groups of vowels and observation of the direction of change within the vowel quadrilateral as a result of practice. Perception tests were performed before and after training, revealing that most learners perceived selected vowels correctly already before the practice. The study showed that both feedback methods can be successfully applied to L2 classroom learning, and both lead to improvement in the pronunciation of the selected vowels, as well as the Swedish vowel set as a whole. However, ultrasound tongue imaging seems to have an advantage, as it resulted in a greater number of improved targets.}, } @article {pmid38656176, year = {2024}, author = {Saldías O'Hrens, M and Castro, C and Espinoza, VM and Stoney, J and Quezada, C and Laukkanen, AM}, title = {Spectral features related to the auditory perception of twang-like voices.}, journal = {Logopedics, phoniatrics, vocology}, volume = {}, number = {}, pages = {1-18}, doi = {10.1080/14015439.2024.2345373}, pmid = {38656176}, issn = {1651-2022}, abstract = {BACKGROUND: To the best of our knowledge, studies on the relationship between spectral energy distribution and the degree of perceived twang-like voices are still sparse. Through an auditory-perceptual test, we aimed to explore the spectral features that may relate to the auditory perception of twang-like voices.

METHODS: Ten judges who were blind to the test's tasks and stimuli rated the amount of twang perceived on seventy-six audio samples. The stimuli consisted of twenty voices recorded from eight CCM singers who sustained the vowel [a:] in different pitches, with and without a twang-like voice. Also, forty filtered and sixteen synthesized-manipulated stimuli were included.

RESULTS AND CONCLUSIONS: Based on the intra-rater reliability scores, four judges were identified as suitable to be included in the analyses. Results showed that the frequencies of F1 and F2 correlated strongly with the auditory perception of twang-like voices (0.90 and 0.74, respectively), whereas F3 showed a moderate negative correlation (-0.52). The frequency difference between F1 and F3 showed a strong negative correlation (-0.82). The mean energy between 1-2 kHz and 2-3 kHz correlated moderately (0.51 and 0.42, respectively). The frequencies of F4 and F5 and the energy above 3 kHz showed weak correlations. Since spectral changes under 2 kHz have been associated with jaw, lip, and tongue adjustments (i.e., vowel articulation), and a higher vertical laryngeal position might affect the frequency of all formants (including F1 and F2), our results suggest that vowel articulation and laryngeal height may be relevant when performing twang-like voices.}, } @article {pmid38644071, year = {2024}, author = {Cruz, TLB and Frič, M and Andrade, PA}, title = {A Comparison of Countertenor Singing at Various Professional Levels Using Acoustic, Electroglottographic, and Videofluoroscopic Methods.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.03.033}, pmid = {38644071}, issn = {1873-4588}, abstract = {INTRODUCTION: The vocal characteristics of countertenors (CTTs) are poorly understood due to a lack of studies in this field. This study aims to explore differences among CTTs at various professional levels, examining both disparities and congruences in singing styles to better understand the CTT voice.

MATERIALS AND METHODS: Four CTTs (one student, one amateur, and two professionals) sang "La giustizia ha già sull'arco" from Handel's Giulio Cesare, with concurrent videofluoroscopic, electroglottographic (EGG), and acoustic data collection. Auditory-perceptual analysis was employed to rate professional level. Acoustic analysis included LH1-LH2, formant cluster prominence, and vibrato analysis. EGG data were analyzed using FonaDyn software, while anatomical modifications were quantified using videofluoroscopic images.

RESULTS: CTTs exhibited EGG contact quotient values surpassing typical levels for inexperienced falsettos. Their vibrato characteristics aligned with expectations for classical singing, whereas the singer's formant was not observed. Variations in supraglottic adjustments among CTTs underscored the diversity of techniques employed by CTT singers.

CONCLUSIONS: CTTs exhibited vocal techniques that highlighted the influence of individual preferences, professional experience, and stylistic choices in shaping their singing characteristics. The data revealed discernible differences between professional and amateur CTTs, providing insights into the impact of varying levels of experience on vocal expression.}, } @article {pmid38629882, year = {2024}, author = {Torres, C and Li, W and Escudero, P}, title = {Acoustic, phonetic, and phonological features of Drehu vowels.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {4}, pages = {2612-2626}, doi = {10.1121/10.0025538}, pmid = {38629882}, issn = {1520-8524}, mesh = {*Phonetics ; *Speech Acoustics ; Acoustics ; }, abstract = {This study presents an acoustic investigation of the vowel inventory of Drehu (Southern Oceanic Linkage), spoken in New Caledonia. Reportedly, Drehu has a 14 vowel system distinguishing seven vowel qualities and an additional length distinction. Previous phonological descriptions were based on impressionistic accounts showing divergent proposals for two out of seven reported vowel qualities. This study presents the first phonetic investigation of Drehu vowels based on acoustic data from eight speakers. To examine the phonetic correlates of the proposed phonological vowel inventory, multi-point acoustic analyses were used, and vowel inherent spectral change (VISC) was investigated (F1, F2, and F3). Additionally, vowel duration was measured. Contrary to reports from other studies on VISC in monophthongs, we find that monophthongs in Drehu are mostly steady state. We propose a revised vowel inventory and focus on the acoustic description of open-mid /ɛ/ and the central vowel /ə/, whose status was previously unclear. Additionally, we find that vowel quality stands orthogonal to vowel quantity by demonstrating that the phonological vowel length distinction is primarily based on a duration cue rather than formant structure. Finally, we report the acoustic properties of the seven vowel qualities that were identified.}, } @article {pmid38564597, year = {2024}, author = {Wang, H and Ali, Y and Max, L}, title = {Perceptual formant discrimination during speech movement planning.}, journal = {PloS one}, volume = {19}, number = {4}, pages = {e0301514}, pmid = {38564597}, issn = {1932-6203}, support = {R01 DC017444/DC/NIDCD NIH HHS/United States ; R01 DC020162/DC/NIDCD NIH HHS/United States ; T32 DC005361/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Speech/physiology ; *Speech Perception/physiology ; Acoustics ; *Auditory Cortex ; Movement ; Phonetics ; Speech Acoustics ; }, abstract = {Evoked potential studies have shown that speech planning modulates auditory cortical responses. The phenomenon's functional relevance is unknown. We tested whether, during this time window of cortical auditory modulation, there is an effect on speakers' perceptual sensitivity for vowel formant discrimination. Participants made same/different judgments for pairs of stimuli consisting of a pre-recorded, self-produced vowel and a formant-shifted version of the same production. Stimuli were presented prior to a "go" signal for speaking, prior to passive listening, and during silent reading. The formant discrimination stimulus /uh/ was tested with a congruent productions list (words with /uh/) and an incongruent productions list (words without /uh/). 
Logistic curves were fitted to participants' responses, and the just-noticeable difference (JND) served as a measure of discrimination sensitivity. We found a statistically significant effect of condition (worst discrimination before speaking) without congruency effect. Post-hoc pairwise comparisons revealed that JND was significantly greater before speaking than during silent reading. Thus, formant discrimination sensitivity was reduced during speech planning regardless of the congruence between discrimination stimulus and predicted acoustic consequences of the planned speech movements. This finding may inform ongoing efforts to determine the functional relevance of the previously reported modulation of auditory processing during speech planning.}, } @article {pmid38557735, year = {2024}, author = {Havenhill, J}, title = {Articulatory and acoustic dynamics of fronted back vowels in American English.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {4}, pages = {2285-2301}, doi = {10.1121/10.0025461}, pmid = {38557735}, issn = {1520-8524}, mesh = {United States ; *Speech Acoustics ; *Phonetics ; Acoustics ; Language ; South Carolina ; }, abstract = {Fronting of the vowels /u, ʊ, o/ is observed throughout most North American English varieties, but has been analyzed mainly in terms of acoustics rather than articulation. Because an increase in F2, the acoustic correlate of vowel fronting, can be the result of any gesture that shortens the front cavity of the vocal tract, acoustic data alone do not reveal the combination of tongue fronting and/or lip unrounding that speakers use to produce fronted vowels. It is furthermore unresolved to what extent the articulation of fronted back vowels varies according to consonantal context and how the tongue and lips contribute to the F2 trajectory throughout the vowel. This paper presents articulatory and acoustic data on fronted back vowels from two varieties of American English: coastal Southern California and South Carolina. Through analysis of dynamic acoustic, ultrasound, and lip video data, it is shown that speakers of both varieties produce fronted /u, ʊ, o/ with rounded lips, and that high F2 observed for these vowels is associated with a front-central tongue position rather than unrounded lips. Examination of time-varying formant trajectories and articulatory configurations shows that the degree of vowel-internal F2 change is predominantly determined by coarticulatory influence of the coda.}, } @article {pmid38530014, year = {2024}, author = {Singh, VP and Sahidullah, M and Kinnunen, T}, title = {ChildAugment: Data augmentation methods for zero-resource children's speaker verification.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {3}, pages = {2221-2232}, doi = {10.1121/10.0025178}, pmid = {38530014}, issn = {1520-8524}, mesh = {Male ; Child ; Adult ; Female ; Humans ; Reproducibility of Results ; *Speech ; *Acoustics ; Neural Networks, Computer ; Motivation ; }, abstract = {The accuracy of modern automatic speaker verification (ASV) systems, when trained exclusively on adult data, drops substantially when applied to children's speech. The scarcity of children's speech corpora hinders fine-tuning ASV systems for children's speech. Hence, there is a timely need to explore more effective ways of reusing adults' speech data. 
One promising approach is to align vocal-tract parameters between adults and children through children-specific data augmentation, referred to here as ChildAugment. Specifically, we modify the formant frequencies and formant bandwidths of adult speech to emulate children's speech. The modified spectra are used to train an emphasized channel attention, propagation, and aggregation in time-delay neural network (ECAPA-TDNN) recognizer for children. We compare ChildAugment against various state-of-the-art data augmentation techniques for children's ASV. We also extensively compare different scoring methods, including cosine scoring, probabilistic linear discriminant analysis (PLDA), and neural PLDA. We also propose a low-complexity weighted cosine score for extremely low-resource children's ASV. Our findings on the CSLU kids corpus indicate that ChildAugment holds promise as a simple, acoustics-motivated approach for improving state-of-the-art deep-learning-based ASV for children. We achieve up to 12.45% (boys) and 11.96% (girls) relative improvement over the baseline. For reproducibility, we provide the evaluation protocols and codes here.}, } @article {pmid38503674, year = {2024}, author = {Södersten, M and Oates, J and Sand, A and Granqvist, S and Quinn, S and Dacakis, G and Nygren, U}, title = {Gender-Affirming Voice Training for Trans Women: Acoustic Outcomes and Their Associations With Listener Perceptions Related to Gender.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.02.003}, pmid = {38503674}, issn = {1873-4588}, abstract = {OBJECTIVES: To investigate acoustic outcomes of gender-affirming voice training for trans women wanting to develop a female sounding voice and to describe what happens acoustically when male sounding voices become more female sounding.
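The ChildAugment method described in the Singh et al. entry above modifies formant frequencies and bandwidths in the spectral domain; as a loose, generic stand-in (not the authors' method), Praat's "Change gender" resynthesis can raise adult formants toward child-like values, as sketched below with an assumed shift ratio and file name.

```python
# Minimal sketch: raising adult formant frequencies via Praat's "Change gender"
# resynthesis through parselmouth. Shift ratio and file names are assumptions.
import parselmouth
from parselmouth.praat import call

adult = parselmouth.Sound("adult.wav")
childlike = call(adult, "Change gender",
                 75, 600,   # pitch analysis floor/ceiling (Hz)
                 1.25,      # formant shift ratio > 1 raises formants (assumed value)
                 0.0,       # new pitch median: 0 = keep the original median
                 1.0, 1.0)  # pitch range factor, duration factor
childlike.save("adult_childlike.wav", "WAV")
```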

STUDY DESIGN: Prospective treatment study with repeated measures.

METHODS: N = 74 trans women completed a voice training program of 8-12 sessions and had their voices audio recorded twice before and twice after training. Reference data were obtained from N = 40 cisgender speakers. Fundamental frequency (fo), formant frequencies (F1-F4), sound pressure level (Leq), and level difference between first and second harmonic (L1-L2) were extracted from a reading passage and spontaneous speech. N = 79 naive listeners provided gender-related ratings of participants' audio recordings. A linear mixed-effects model was used to estimate average training effects. Individual level analyses determined how changes in acoustic data were related to listeners' ratings.
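For readers who want to reproduce measures of this kind, here is a hedged sketch using the praat-parselmouth package (an assumption; the study does not state its software) to extract mean fo and median F1-F4 from a recording:

```python
import numpy as np
import parselmouth  # Python interface to Praat

snd = parselmouth.Sound("reading_passage.wav")     # hypothetical file

pitch = snd.to_pitch()
f0 = pitch.selected_array["frequency"]
f0_mean = f0[f0 > 0].mean()                        # skip unvoiced frames

formants = snd.to_formant_burg(max_number_of_formants=5)
times = np.arange(0.0, snd.duration, 0.01)         # 10-ms analysis grid
medians = {n: np.nanmedian([formants.get_value_at_time(n, t) for t in times])
           for n in (1, 2, 3, 4)}
print(f"mean fo = {f0_mean:.0f} Hz",
      {f"F{n}": round(v) for n, v in medians.items()})
```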

RESULTS: Group data showed substantial training effects on fo (average, minimum, and maximum) and formant frequencies. Individual data demonstrated that many participants also increased Leq and some increased L1-L2. Measures that most strongly predicted listener ratings of a female sounding voice were: fo, average formant frequency, and Leq.

CONCLUSIONS: This is the largest prospective study reporting on acoustic outcomes of gender-affirming voice training for trans women. We confirm findings from previous smaller-scale studies by demonstrating that listener perceptions of male and female sounding voices are related to acoustic voice features, and that voice training for trans women wanting to sound female is associated with desirable acoustic changes, indicating training effectiveness. Although acoustic measures can be a valuable indicator of training effectiveness, particularly from the perspective of clinicians and researchers, we contend that a combination of outcome measures, including client perspectives, is needed to provide a comprehensive evaluation of gender-affirming voice training that is relevant for all stakeholders.}, } @article {pmid38501906, year = {2024}, author = {Dolquist, DV and Munson, B}, title = {Clinical Focus: The Development and Description of a Palette of Transmasculine Voices.}, journal = {American journal of speech-language pathology}, volume = {33}, number = {3}, pages = {1113-1126}, doi = {10.1044/2024_AJSLP-23-00398}, pmid = {38501906}, issn = {1558-9110}, mesh = {Humans ; Male ; *Transgender Persons/psychology ; *Speech Acoustics ; *Voice Quality ; Adult ; *Speech Production Measurement ; Young Adult ; Speech-Language Pathology/methods ; Female ; Middle Aged ; Phonetics ; }, abstract = {PURPOSE: The study of gender and speech has historically excluded studies of transmasculine individuals. Consequently, generalizations about speech and gender are based on cisgender individuals. This lack of representation hinders clinical training and clinical service delivery, particularly by speech-language pathologists providing gender-affirming communication services. This letter describes a new corpus of the speech of American English-speaking transmasculine men, transmasculine nonbinary people, and cisgender men that is open and available to clinicians and researchers.

METHOD: Twenty masculine-presenting native English speakers from the Upper Midwestern United States (including cisgender men, transmasculine men, and transmasculine nonbinary people) were recorded, producing three sets of speech materials: Consensus Auditory-Perceptual Evaluation of Voice sentences, the Rainbow Passage, and a novel set of sentences developed for this project. Acoustic measures were made of vowels (overall formant frequency scaling, vowel-space dispersion, fundamental frequency, breathiness), of consonants (voice onset time of word-initial voiceless stops, spectral moments of word-initial /s/), and of the entire sentence (rate of speech).
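One plausible operationalization of the vowel-space dispersion named above (an assumption; the letter does not give its formula) is the mean Euclidean distance of vowel tokens from their F1-F2 centroid:

```python
import numpy as np

def vowel_space_dispersion(f1, f2):
    """Mean distance (Hz) of F1-F2 points from their centroid."""
    pts = np.column_stack([f1, f2])
    return np.linalg.norm(pts - pts.mean(axis=0), axis=1).mean()

# hypothetical corner-vowel tokens
f1 = np.array([300, 320, 750, 730, 350, 680])
f2 = np.array([2300, 2250, 1300, 1350, 900, 1100])
print(f"dispersion = {vowel_space_dispersion(f1, f2):.1f} Hz")
```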

RESULTS: The acoustic measures reveal a wide range for all dependent measures and low correlations among the measures. Results show that many of the voices depart considerably from the norms for men's speech in published studies.

CONCLUSION: This new corpus can be used to illustrate different ways of sounding masculine by speech-language pathologists performing gender-affirming communication services and by higher education teachers as examples of diverse ways of sounding masculine.}, } @article {pmid38498664, year = {2024}, author = {Kim, Y and Thompson, A and Nip, ISB}, title = {Effects of Deep-Brain Stimulation on Speech: Perceptual and Acoustic Data.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {4}, pages = {1090-1106}, pmid = {38498664}, issn = {1558-9102}, support = {F31 DC020121/DC/NIDCD NIH HHS/United States ; R01 DC020468/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Deep Brain Stimulation ; Acoustics ; Speech Intelligibility/physiology ; Voice Quality ; *Parkinson Disease/complications/therapy ; Brain ; Speech Acoustics ; }, abstract = {PURPOSE: This study examined speech changes induced by deep-brain stimulation (DBS) in speakers with Parkinson's disease (PD) using a set of auditory-perceptual and acoustic measures.

METHOD: Speech recordings from nine speakers with PD and DBS were compared between DBS-On and DBS-Off conditions using auditory-perceptual and acoustic analyses. Auditory-perceptual ratings included voice quality, articulation precision, prosody, speech intelligibility, and listening effort obtained from 44 listeners. Acoustic measures were made for voicing proportion, second formant frequency slope, vowel dispersion, articulation rate, and range of fundamental frequency and intensity.
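The second formant frequency slope named above is commonly estimated as the slope of a straight-line fit to the F2 track over a vowel or utterance; a hedged sketch with a synthetic track (values and sampling step are hypothetical):

```python
import numpy as np

t = np.arange(0.0, 0.20, 0.01)                           # 10-ms steps, 200-ms vowel
f2 = 1200 + 1500 * t + np.random.normal(0, 20, t.size)   # synthetic rising F2 track

slope = np.polyfit(t, f2, 1)[0]                          # first-order fit
print(f"F2 slope = {slope:.0f} Hz/s")
```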

RESULTS: No significant changes were found between DBS-On and DBS-Off for the five perceptual ratings. Four of six acoustic measures revealed significant differences between the two conditions. While articulation rate and acoustic vowel dispersion increased, voicing proportion and intensity range decreased from the DBS-Off to DBS-On condition. However, a visual examination of the data indicated that the statistical significance was mostly driven by a small number of participants, while the majority did not show a consistent pattern of such changes.

CONCLUSIONS: Our data, in general, indicate that no-to-minimal changes in speech production ensued from DBS. The findings are discussed with a focus on the large interspeaker variability in speech characteristics among individuals with PD and the potential effects of DBS on speech.}, } @article {pmid38498508, year = {2024}, author = {Sabev, M and Andreeva, B}, title = {The acoustics of Contemporary Standard Bulgarian vowels: A corpus study.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {3}, pages = {2128-2138}, doi = {10.1121/10.0025293}, pmid = {38498508}, issn = {1520-8524}, mesh = {*Speech Acoustics ; Bulgaria ; *Phonetics ; Acoustics ; Multivariate Analysis ; }, abstract = {A comprehensive examination of the acoustics of Contemporary Standard Bulgarian vowels is lacking to date, and this article aims to fill that gap. Six acoustic variables-the first three formant frequencies, duration, mean f0, and mean intensity-of 11 615 vowel tokens from 140 speakers were analysed using linear mixed models, multivariate analysis of variance, and linear discriminant analysis. The vowel system, which comprises six phonemes in stressed position, [ε a ɔ i ɤ u], was examined from four angles. First, vowels in pretonic syllables were compared to other unstressed vowels, and no spectral or durational differences were found, contrary to an oft-repeated claim that pretonic vowels reduce less. Second, comparisons of stressed and unstressed vowels revealed significant differences in all six variables for the non-high vowels [ε a ɔ]. No spectral or durational differences were found in [i ɤ u], which disproves another received view that high vowels are lowered when unstressed. Third, non-high vowels were compared with their high counterparts; the height contrast was completely neutralized in unstressed [a-ɤ] and [ɔ-u] while [ε-i] remained distinct. Last, the acoustic correlates of vowel contrasts were examined, and it was demonstrated that only F1, F2 frequencies and duration were systematically employed in differentiating vowel phonemes.}, } @article {pmid38497731, year = {2024}, author = {Ashokumar, M and Schwartz, JL and Ito, T}, title = {Changes in Speech Production Following Perceptual Training With Orofacial Somatosensory Inputs.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {10S}, pages = {3962-3973}, doi = {10.1044/2023_JSLHR-23-00249}, pmid = {38497731}, issn = {1558-9102}, mesh = {Humans ; *Speech Perception/physiology ; Female ; Male ; Young Adult ; *Speech/physiology ; Adult ; *Phonetics ; Face/physiology ; Learning/physiology ; }, abstract = {PURPOSE: Orofacial somatosensory inputs play an important role in speech motor control and speech learning. Since receiving specific auditory-somatosensory inputs during speech perceptual training alters speech perception, similar perceptual training could also alter speech production. We examined whether production performance was changed by perceptual training with orofacial somatosensory inputs.
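Relating to the Bulgarian corpus study above (Sabev and Andreeva), the linear discriminant analysis step can be sketched over the three cues the authors found systematically employed (F1, F2, and duration). The data below are fabricated placeholders purely to show the mechanics:

```python
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
# columns: F1 (Hz), F2 (Hz), duration (ms); two toy vowel classes
X_i = rng.normal([300, 2200, 90], [40, 150, 15], size=(50, 3))
X_a = rng.normal([700, 1300, 110], [60, 120, 20], size=(50, 3))
X, y = np.vstack([X_i, X_a]), np.array(["i"] * 50 + ["a"] * 50)

lda = LinearDiscriminantAnalysis()
print("cross-validated accuracy:", cross_val_score(lda, X, y, cv=5).mean())
```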

METHOD: We focused on the French vowels /e/ and /ø/, contrasted in their articulation by horizontal gestures. Perceptual training consisted of a vowel identification task contrasting /e/ and /ø/. Along with training, for the first group of participants, somatosensory stimulation was applied as facial skin stretch in the backward direction. We recorded the target vowels uttered by the participants before and after the perceptual training and compared their F1, F2, and F3 formants. We also tested a control group with no somatosensory stimulation and another somatosensory group with a different vowel continuum (/e/-/i/) for perceptual training.
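For a pre/post design like the one above, the formant comparison can be sketched as a paired test on per-participant means; the values are hypothetical and do not reproduce the study's statistics:

```python
import numpy as np
from scipy.stats import ttest_rel

f2_pre = np.array([1580, 1610, 1555, 1600, 1590, 1620])   # Hz, per participant
f2_post = np.array([1540, 1575, 1530, 1560, 1565, 1580])  # consistent F2 decrease

t, p = ttest_rel(f2_post, f2_pre)
print(f"mean change = {np.mean(f2_post - f2_pre):.1f} Hz, t = {t:.2f}, p = {p:.3f}")
```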

RESULTS: Perceptual training with somatosensory stimulation induced changes in F2 and F3 in the produced vowel sounds. F2 decreased consistently in the two somatosensory groups. F3 increased following the /e/-/ø/ training and decreased following the /e/-/i/ training. F2 change was significantly correlated with the perceptual shift between the first and second half of the training phase in the somatosensory group with the /e/-/ø/ training, but not with the /e/-/i/ training. The control group showed no effect on F2 and F3, and only a tendency toward an F1 increase.

CONCLUSION: The results suggest that somatosensory inputs associated with speech sound inputs can play a role in speech training and learning in both production and perception.}, } @article {pmid38480766, year = {2024}, author = {Saha, S and Rattansingh, A and Martino, R and Viswanathan, K and Saha, A and Montazeri Ghahjaverestan, N and Yadollahi, A}, title = {A pilot observation using ultrasonography and vowel articulation to investigate the influence of suspected obstructive sleep apnea on upper airway.}, journal = {Scientific reports}, volume = {14}, number = {1}, pages = {6144}, pmid = {38480766}, issn = {2045-2322}, mesh = {Humans ; Pilot Projects ; *Sleep Apnea, Obstructive/complications ; *Sleep Apnea Syndromes/complications ; Trachea ; Ultrasonography ; }, abstract = {Failure to employ suitable measures before administering full anesthesia to patients with obstructive sleep apnea (OSA) who are undergoing surgery may lead to complications after surgery. Therefore, it is very important to screen for OSA before surgery, which is currently done with subjective questionnaires such as the STOP-Bang and Berlin scores. These questionnaires have 10-36% specificity in detecting sleep apnea and give no information on the anatomy of the upper airway, which is important for intubation. To address these challenges, we performed a pilot study to understand the utility of ultrasonography and vowel articulation in screening OSA. Our objective was to investigate the influence of OSA risk factors on vowel articulation through ultrasonography and acoustic feature analysis. To accomplish this, we recruited 18 individuals with no risk of OSA and 13 individuals with high risk of OSA and asked them to utter vowels, such as /a/ (as in "Sah") and /i/ (as in "See"). An expert ultrasonographer measured the parasagittal anterior-posterior (PAP) and transverse diameter of the upper airway. From the recorded vowel sounds, we extracted 106 features, including power, pitch, formant, and Mel frequency cepstral coefficients (MFCC). We analyzed the variation of the PAP diameters and vowel features from "See" (/i/) to "Sah" (/a/) between control and OSA groups by two-way repeated measures ANOVA. We found that the variation of upper airway diameter from "See" to "Sah" was significantly smaller in the OSA group than in the control group (OSA: ∆12.8 ± 5.3 mm vs. control: ∆22.5 ± 3.9 mm, p < 0.01). Moreover, we found that several vowel features showed the exact same or opposite trend as the PAP diameter variation, which led us to build a machine learning model to estimate PAP diameter from vowel features. We found a correlation coefficient of 0.75 between the estimated and measured PAP diameter after applying four estimation models and combining their output with a random forest model, which showed the feasibility of using acoustic features of vowel sounds to monitor upper airway diameter. Overall, this study has proven the concept that ultrasonography and vowel sound analysis may be useful as an easily accessible imaging tool of the upper airway.}, } @article {pmid38469160, year = {2024}, author = {Lee, H and Cho, M and Kwon, HY}, title = {Attention-based speech feature transfer between speakers.}, journal = {Frontiers in artificial intelligence}, volume = {7}, number = {}, pages = {1259641}, pmid = {38469160}, issn = {2624-8212}, abstract = {In this study, we propose a simple yet effective method for incorporating the source speaker's characteristics in the target speaker's speech.
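The acoustic half of the OSA pilot above (Saha et al.) pairs spectral features with a regression model; here is a hedged sketch of that kind of pipeline, where the feature set, file names, and targets are stand-ins rather than the study's 106 features:

```python
import numpy as np
import librosa
from sklearn.ensemble import RandomForestRegressor

def mfcc_features(path, n_mfcc=13):
    """One MFCC mean vector per recording."""
    y, sr = librosa.load(path, sr=None)
    return librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc).mean(axis=1)

# hypothetical: one row per participant; target = PAP diameter change (mm)
X = np.vstack([mfcc_features(p) for p in ["s01_a.wav", "s02_a.wav"]])
y = np.array([12.8, 22.5])

model = RandomForestRegressor(n_estimators=200, random_state=0).fit(X, y)
print(model.predict(X))
```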
This allows our model to generate the speech of the target speaker with the style of the source speaker. To achieve this, we focus on the attention model within the speech synthesis model, which learns various speaker features such as spectrogram, pitch, intensity, formant, pulse, and voice breaks. The model is trained separately using datasets specific to the source and target speakers. Subsequently, we replace the attention weights learned from the source speaker's dataset with the attention weights from the target speaker's model. Finally, by providing new input texts to the target model, we generate the speech of the target speaker with the styles of the source speaker. We validate the effectiveness of our model through similarity analysis utilizing five evaluation metrics and showcase real-world examples.}, } @article {pmid38456732, year = {2024}, author = {Borjigin, A and Bakst, S and Anderson, K and Litovsky, RY and Niziolek, CA}, title = {Discrimination and sensorimotor adaptation of self-produced vowels in cochlear implant users.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {3}, pages = {1895-1908}, pmid = {38456732}, issn = {1520-8524}, support = {R01 DC003083/DC/NIDCD NIH HHS/United States ; R00 DC014520/DC/NIDCD NIH HHS/United States ; T32 DC005359/DC/NIDCD NIH HHS/United States ; U54 HD090256/HD/NICHD NIH HHS/United States ; F32 DC017653/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Cochlear Implants ; *Speech Perception ; *Cochlear Implantation ; Auditory Perception ; Speech ; }, abstract = {Humans rely on auditory feedback to monitor and adjust their speech for clarity. Cochlear implants (CIs) have helped over a million people restore access to auditory feedback, which significantly improves speech production. However, there is substantial variability in outcomes. This study investigates the extent to which CI users can use their auditory feedback to detect self-produced sensory errors and make adjustments to their speech, given the coarse spectral resolution provided by their implants. First, we used an auditory discrimination task to assess the sensitivity of CI users to small differences in formant frequencies of their self-produced vowels. Then, CI users produced words with altered auditory feedback in order to assess sensorimotor adaptation to auditory error. Almost half of the CI users tested can detect small, within-channel differences in their self-produced vowels, and they can utilize this auditory feedback towards speech adaptation. An acoustic hearing control group showed better sensitivity to the shifts in vowels, even in CI-simulated speech, and elicited more robust speech adaptation behavior than the CI users. Nevertheless, this study confirms that CI users can compensate for sensory errors in their speech and supports the idea that sensitivity to these errors may relate to variability in production.}, } @article {pmid38443265, year = {2024}, author = {Stone, TC and Erickson, ML}, title = {Experienced and Inexperienced Listeners' Perception of Vocal Strain.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.02.002}, pmid = {38443265}, issn = {1873-4588}, abstract = {OBJECTIVE: The ability to perceive strain or tension in a voice is critical for both speech-language pathologists and singing teachers. Research on voice quality has focused primarily on the perception of breathiness or roughness. 
The perception of vocal strain has not been extensively researched and is poorly understood.

METHODS/DESIGN: This study employs a group and a within-subject design. Synthetic female sung stimuli were created that varied in source slope and vocal tract transfer function. Two groups of listeners, inexperienced listeners and experienced vocal pedagogues, listened to the stimuli and rated the perceived strain using a visual analog scale. Synthetic female stimuli were constructed on the vowel /ɑ/ at two pitches, A3 and F5, using glottal source slopes that drop in amplitude at constant rates varying from -6 dB/octave to -18 dB/octave. All stimuli were filtered using three vocal tract transfer functions, one derived from a lyric/coloratura soprano, one derived from a mezzo-soprano, and a third that has resonance frequencies mid-way between the two. Listeners heard the stimuli over headphones and rated them on a scale from "no strain" to "very strained" using a visual analog scale.
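The source-filter construction described above can be sketched as follows (hedged; the study used its own synthesis, and the formant frequencies and bandwidths here are illustrative): a harmonic source whose spectrum falls at a fixed dB/octave rate, passed through second-order formant resonators.

```python
import numpy as np
from scipy.signal import lfilter

fs, dur, f0 = 44100, 1.0, 220.0              # A3
slope_db = -12.0                             # source slope, dB/octave
t = np.arange(int(fs * dur)) / fs

n_harm = int((fs / 2) // f0)
src = sum(10 ** (slope_db * np.log2(n) / 20) * np.sin(2 * np.pi * n * f0 * t)
          for n in range(1, n_harm + 1))     # harmonic source with given slope

def resonator(x, fc, bw, fs):
    """Second-order (Klatt-style) formant resonator with unity DC gain."""
    r = np.exp(-np.pi * bw / fs)
    b1, b2 = 2 * r * np.cos(2 * np.pi * fc / fs), -r * r
    return lfilter([1 - b1 - b2], [1, -b1, -b2], x)

out = src
for fc, bw in [(800, 80), (1150, 90), (2900, 120), (3900, 130), (4950, 140)]:
    out = resonator(out, fc, bw, fs)         # illustrative soprano-like formants
out /= np.abs(out).max()                     # normalize for playback
```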

RESULTS: Spectral source slope was strongly related to the perception of strain in both groups of listeners. Experienced listeners' perception of strain was also related to formant pattern, while inexperienced listeners' perception of strain was also related to pitch.

CONCLUSION: This study has shown that spectral source slope can be a powerful cue to the perception of strain. However, inexperienced and experienced listeners also differ from each other in how strain is perceived across speaking and singing pitches. These differences may be based on both experience and the goals of the listener.}, } @article {pmid38440592, year = {2024}, author = {Umashankar, A and Ramamoorthy, S and Selvaraj, JL and Dhandayutham, S}, title = {Comparative Study on the Acoustic Analysis of Voice in Auditory Brainstem Implantees, Cochlear Implantees, and Normal Hearing Children.}, journal = {Indian journal of otolaryngology and head and neck surgery : official publication of the Association of Otolaryngologists of India}, volume = {76}, number = {1}, pages = {645-652}, pmid = {38440592}, issn = {2231-3796}, abstract = {The aim of the study was to compare the acoustic characteristics of voice between Auditory Brainstem Implantees, Cochlear Implantees and normal hearing children. Voice parameters such as fundamental frequency, formant frequencies, perturbation measures, and harmonic-to-noise ratio were measured in a total of 30 children, of which 10 were Auditory Brainstem Implantees, 10 were Cochlear Implantees and 10 were normal hearing children. Parametric and nonparametric statistics were used to establish the nature of significant differences between the three groups. Overall deviancies were seen in the implanted group for all acoustic parameters. However, abnormal deviations were seen in individuals with Auditory Brainstem Implants, indicating a deficit in the auditory feedback loop that impacts voice characteristics. This feedback deficit could contribute to the poorer performance in the ABI and CI groups. The CI group performed comparatively better than the ABI group, suggesting that this type of implant preserves some degree of auditory feedback. However, additional supporting evidence is needed, ideally from a study with a larger sample size and a longitudinal design.}, } @article {pmid38435340, year = {2023}, author = {Cuadros, J and Z-Rivera, L and Castro, C and Whitaker, G and Otero, M and Weinstein, A and Martínez-Montes, E and Prado, P and Zañartu, M}, title = {DIVA Meets EEG: Model Validation Using Formant-Shift Reflex.}, journal = {Applied sciences (Basel, Switzerland)}, volume = {13}, number = {13}, pages = {}, pmid = {38435340}, issn = {2076-3417}, support = {P50 DC015446/DC/NIDCD NIH HHS/United States ; }, abstract = {The neurocomputational model 'Directions into Velocities of Articulators' (DIVA) was developed to account for various aspects of normal and disordered speech production and acquisition. The neural substrates of DIVA were established through functional magnetic resonance imaging (fMRI), providing physiological validation of the model. This study introduces DIVA_EEG, an extension of DIVA that utilizes electroencephalography (EEG) to leverage the high temporal resolution and broad availability of EEG over fMRI. For the development of DIVA_EEG, EEG-like signals were derived from original equations describing the activity of the different DIVA maps. Synthetic EEG associated with the utterance of syllables was generated when both unperturbed and perturbed auditory feedback (first formant perturbations) were simulated. The cortical activation maps derived from synthetic EEG closely resembled those of the original DIVA model.
To validate DIVA_EEG, the EEG of individuals with typical voices (N = 30) was acquired during an altered auditory feedback paradigm. The resulting empirical brain activity maps significantly overlapped with those predicted by DIVA_EEG. In conjunction with other recent model extensions, DIVA_EEG lays the foundations for constructing a complete neurocomputational framework to tackle vocal and speech disorders, which can guide model-driven personalized interventions.}, } @article {pmid38418558, year = {2024}, author = {Fletcher, MD and Akis, E and Verschuur, CA and Perry, SW}, title = {Improved tactile speech perception using audio-to-tactile sensory substitution with formant frequency focusing.}, journal = {Scientific reports}, volume = {14}, number = {1}, pages = {4889}, pmid = {38418558}, issn = {2045-2322}, support = {EP/W032422/1//Engineering and Physical Sciences Research Council/ ; EP/T517859/1//Engineering and Physical Sciences Research Council/ ; }, mesh = {Humans ; *Speech Perception ; Speech ; Touch ; *Touch Perception ; *Cochlear Implants ; }, abstract = {Haptic hearing aids, which provide speech information through tactile stimulation, could substantially improve outcomes for both cochlear implant users and for those unable to access cochlear implants. Recent advances in wide-band haptic actuator technology have made new audio-to-tactile conversion strategies viable for wearable devices. One such strategy filters the audio into eight frequency bands, which are evenly distributed across the speech frequency range. The amplitude envelopes from the eight bands modulate the amplitudes of eight low-frequency tones, which are delivered through vibration to a single site on the wrist. This tactile vocoder strategy effectively transfers some phonemic information, but vowels and obstruent consonants are poorly portrayed. In 20 participants with normal touch perception, we tested (1) whether focusing the audio filters of the tactile vocoder more densely around the first and second formant frequencies improved tactile vowel discrimination, and (2) whether focusing filters at mid-to-high frequencies improved obstruent consonant discrimination. The obstruent-focused approach was found to be ineffective. However, the formant-focused approach improved vowel discrimination by 8%, without changing overall consonant discrimination. The formant-focused tactile vocoder strategy, which can readily be implemented in real time on a compact device, could substantially improve speech perception for haptic hearing aid users.}, } @article {pmid38381271, year = {2024}, author = {Maya Lastra, N and Rangel Negrín, A and Coyohua Fuentes, A and Dias, PAD}, title = {Mantled howler monkey males assess their rivals through formant spacing of long-distance calls.}, journal = {Primates; journal of primatology}, volume = {65}, number = {3}, pages = {183-190}, pmid = {38381271}, issn = {1610-7365}, support = {726265//Consejo Nacional de Ciencia y Tecnología/ ; 15 1529//Consejo Veracruzano de Ciencia y Tecnología/ ; }, mesh = {Male ; Animals ; *Glucocorticoids/metabolism ; Vocalization, Animal/physiology ; *Alouatta/physiology ; Testosterone ; }, abstract = {Formant frequency spacing of long-distance vocalizations is allometrically related to body size and could represent an honest signal of fighting potential. There is, however, only limited evidence that primates use formant spacing to assess the competitive potential of rivals during interactions with extragroup males, a risky context. 
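The tactile vocoder strategy described in the Fletcher et al. entry above can be sketched as a filter bank whose band envelopes modulate low-frequency vibration tones; the band edges and tone frequencies below are illustrative, not the published parameters:

```python
import numpy as np
from scipy.signal import butter, sosfiltfilt, hilbert

def tactile_vocoder(x, fs, band_edges, tone_freqs):
    t = np.arange(x.size) / fs
    out = np.zeros_like(x)
    for (lo, hi), ft in zip(band_edges, tone_freqs):
        sos = butter(4, [lo, hi], btype="bandpass", fs=fs, output="sos")
        env = np.abs(hilbert(sosfiltfilt(sos, x)))   # band amplitude envelope
        out += env * np.sin(2 * np.pi * ft * t)      # modulate a vibration tone
    return out / np.abs(out).max()

fs = 16000
x = np.random.randn(fs)                              # stand-in for a speech signal
edges = [(100, 400), (400, 800), (800, 1600), (1600, 3200)]
tones = [30, 60, 90, 120]                            # Hz, within the tactile range
vib = tactile_vocoder(x, fs, edges, tones)
```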
We hypothesized that if formant spacing of long-distance calls is inversely related to the fighting potential of male mantled howler monkeys (Alouatta palliata), then males should: (1) be more likely and (2) faster to display vocal responses to calling rivals; (3) be more likely and (4) faster to approach calling rivals; and have higher fecal (5) glucocorticoid and (6) testosterone metabolite concentrations in response to rivals calling at intermediate and high formant spacing than to those with low formant spacing. We studied the behavioral responses of 11 adult males to playback experiments of long-distance calls from unknown individuals with low (i.e., emulating large individuals), intermediate, and high (i.e., small individuals) formant spacing (n = 36 experiments). We assayed fecal glucocorticoid and testosterone metabolite concentrations (n = 174). Playbacks always elicited vocal responses, but males responded quicker to intermediate than to low formant spacing playbacks. Low formant spacing calls were less likely to elicit approaches whereas high formant spacing calls resulted in quicker approaches. Males showed stronger hormonal responses to low than to both intermediate and high formant spacing calls. It is possible that males do not escalate conflicts with rivals with low formant spacing calls if these are perceived as large, and against whom winning probabilities should decrease and confrontation costs increase; but are willing to escalate conflicts with rivals of high formant spacing. Formant spacing may therefore be an important signal for rival assessment in this species.}, } @article {pmid38364044, year = {2024}, author = {Merritt, B and Bent, T and Kilgore, R and Eads, C}, title = {Auditory free classification of gender diverse speakers.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {2}, pages = {1422-1436}, doi = {10.1121/10.0024521}, pmid = {38364044}, issn = {1520-8524}, mesh = {Humans ; Male ; Female ; *Speech Perception ; Voice Quality ; Speech Acoustics ; Masculinity ; *Sexual and Gender Minorities ; }, abstract = {Auditory attribution of speaker gender has historically been assumed to operate within a binary framework. The prevalence of gender diversity and its associated sociophonetic variability motivates an examination of how listeners perceptually represent these diverse voices. Utterances from 30 transgender (1 agender individual, 15 non-binary individuals, 7 transgender men, and 7 transgender women) and 30 cisgender (15 men and 15 women) speakers were used in an auditory free classification paradigm, in which cisgender listeners classified the speakers on perceived general similarity and gender identity. Multidimensional scaling of listeners' classifications revealed two-dimensional solutions as the best fit for general similarity classifications. The first dimension was interpreted as masculinity/femininity, where listeners organized speakers from high to low fundamental frequency and first formant frequency. The second was interpreted as gender prototypicality, where listeners separated speakers with fundamental frequency and first formant frequency at upper and lower extreme values from more intermediate values. Listeners' classifications for gender identity collapsed into a one-dimensional space interpreted as masculinity/femininity. Results suggest that listeners engage in fine-grained analysis of speaker gender that cannot be adequately captured by a gender dichotomy.
Further, varying terminology used in instructions may bias listeners' gender judgements.}, } @article {pmid38358292, year = {2024}, author = {Almurashi, W and Al-Tamimi, J and Khattab, G}, title = {Dynamic specification of vowels in Hijazi Arabic.}, journal = {Phonetica}, volume = {81}, number = {2}, pages = {185-220}, pmid = {38358292}, issn = {1423-0321}, mesh = {Male ; Female ; Humans ; *Phonetics ; *Speech Acoustics ; Language ; Acoustics ; Cues ; }, abstract = {Research on various languages shows that dynamic approaches to vowel acoustics - in particular Vowel-Inherent Spectral Change (VISC) - can play a vital role in characterising and classifying monophthongal vowels compared with a static model. This study's aim was to investigate whether dynamic cues also allow for better description and classification of the Hijazi Arabic (HA) vowel system, a phonological system based on both temporal and spectral distinctions. Along with static and dynamic F1 and F2 patterns, we evaluated the extent to which vowel duration, F0, and F3 contribute to increased/decreased discriminability among vowels. Data were collected from 20 native HA speakers (10 females and 10 males) producing eight HA monophthongal vowels in a word list with varied consonantal contexts. Results showed that dynamic cues provide further insights regarding HA vowels that are not normally gleaned from static measures alone. Using discriminant analysis, the dynamic cues (particularly the seven-point model) had relatively higher classification rates, and vowel duration was found to play a significant role as an additional cue. Our results are in line with dynamic approaches and highlight the importance of looking beyond static cues and beyond the first two formants for further insights into the description and classification of vowel systems.}, } @article {pmid38348589, year = {2024}, author = {Simeone, PJ and Green, JR and Tager-Flusberg, H and Chenausky, KV}, title = {Vowel distinctiveness as a concurrent predictor of expressive language function in autistic children.}, journal = {Autism research : official journal of the International Society for Autism Research}, volume = {17}, number = {2}, pages = {419-431}, doi = {10.1002/aur.3102}, pmid = {38348589}, issn = {1939-3806}, support = {R00 DC017490/DC/NIDCD NIH HHS/United States ; /NH/NIH HHS/United States ; P50 DC013027/DC/NIDCD NIH HHS/United States ; K24 DC016312/DC/NIDCD NIH HHS/United States ; P50 DC018006/DC/NIDCD NIH HHS/United States ; }, mesh = {Child ; Humans ; *Autistic Disorder/complications ; *Autism Spectrum Disorder/complications ; Language ; Speech ; *Language Disorders ; Phonetics ; }, abstract = {Speech ability may limit spoken language development in some minimally verbal autistic children. In this study, we aimed to determine whether an acoustic measure of speech production, vowel distinctiveness, is concurrently related to expressive language (EL) for autistic children. Syllables containing the vowels [i] and [a] were recorded remotely from 27 autistic children (4;1-7;11) with a range of spoken language abilities. Vowel distinctiveness was calculated using automatic formant tracking software. Robust hierarchical regressions were conducted with receptive language (RL) and vowel distinctiveness as predictors of EL. Hierarchical regressions were also conducted within a High EL and a Low EL subgroup. Vowel distinctiveness accounted for 29% of the variance in EL for the entire group, RL for 38%. 
For the Low EL group, only vowel distinctiveness was significant, accounting for 38% of variance in EL. Conversely, in the High EL group, only RL was significant and accounted for 26% of variance in EL. Replicating previous results, speech production and RL significantly predicted concurrent EL in autistic children, with speech production being the sole significant predictor for the Low EL group and RL the sole significant predictor for the High EL group. Further work is needed to determine whether vowel distinctiveness longitudinally, as well as concurrently, predicts EL. Findings have important implications for the early identification of language impairment and in developing language interventions for autistic children.}, } @article {pmid38341748, year = {2024}, author = {Shadle, CH and Fulop, SA and Chen, WR and Whalen, DH}, title = {Assessing accuracy of resonances obtained with reassigned spectrograms from the "ground truth" of physical vocal tract models.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {2}, pages = {1253-1263}, pmid = {38341748}, issn = {1520-8524}, support = {R01 DC002717/DC/NIDCD NIH HHS/United States ; }, mesh = {Child ; Humans ; *Voice ; Acoustics ; Speech Acoustics ; Vibration ; Sound Spectrography ; }, abstract = {The reassigned spectrogram (RS) has emerged as the most accurate way to infer vocal tract resonances from the acoustic signal [Shadle, Nam, and Whalen (2016). "Comparing measurement errors for formants in synthetic and natural vowels," J. Acoust. Soc. Am. 139(2), 713-727]. To date, validating its accuracy has depended on formant synthesis for ground truth values of these resonances. Synthesis is easily controlled, but it has many intrinsic assumptions that do not necessarily accurately realize the acoustics in the way that physical resonances would. Here, we show that physical models of the vocal tract with derivable resonance values allow a separate approach to the ground truth, with a different range of limitations. Our three-dimensional printed vocal tract models were excited by white noise, allowing an accurate determination of the resonance frequencies. Then, sources with a range of fundamental frequencies were implemented, allowing a direct assessment of whether RS avoided the systematic bias towards the nearest strong harmonic to which other analysis techniques are prone. RS was indeed accurate at fundamental frequencies up to 300 Hz; above that, accuracy was somewhat reduced. 
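For the vowel-distinctiveness measure in the Simeone et al. entry above, one plausible operationalization (an assumption; the authors' exact formula may differ) is the Euclidean distance between the [i] and [a] centroids in F1-F2 space:

```python
import numpy as np

def vowel_distinctiveness(f1_i, f2_i, f1_a, f2_a):
    c_i = np.array([np.mean(f1_i), np.mean(f2_i)])   # [i] centroid
    c_a = np.array([np.mean(f1_a), np.mean(f2_a)])   # [a] centroid
    return np.linalg.norm(c_i - c_a)                 # distance in Hz

# hypothetical per-token formant values
print(vowel_distinctiveness([320, 340, 310], [2400, 2350, 2450],
                            [780, 800, 760], [1250, 1300, 1280]))
```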
Future directions include testing mechanical models with the dimensions of children's vocal tracts and making RS more broadly useful by automating the detection of resonances.}, } @article {pmid38319369, year = {2024}, author = {Saghiri, MA and Vakhnovetsky, J and Amanabi, M and Karamifar, K and Farhadi, M and Amini, SB and Conte, M}, title = {Exploring the impact of type II diabetes mellitus on voice quality.}, journal = {European archives of oto-rhino-laryngology : official journal of the European Federation of Oto-Rhino-Laryngological Societies (EUFOS) : affiliated with the German Society for Oto-Rhino-Laryngology - Head and Neck Surgery}, volume = {281}, number = {5}, pages = {2707-2716}, pmid = {38319369}, issn = {1434-4726}, mesh = {Humans ; Voice Quality ; Speech Acoustics ; *Diabetes Mellitus, Type 2/complications ; Cross-Sectional Studies ; Speech Production Measurement ; *Voice ; Acoustics ; }, abstract = {PURPOSE: This cross-sectional study aimed to investigate the potential of voice analysis as a prescreening tool for type II diabetes mellitus (T2DM) by examining the differences in voice recordings between non-diabetic and T2DM participants.

METHODS: Sixty participants diagnosed as non-diabetic (n = 30) or T2DM (n = 30) were recruited on the basis of specific inclusion and exclusion criteria in Iran between February 2020 and September 2023. Participants were matched according to their year of birth and then placed into six age categories. Using the WhatsApp application, participants recorded the translated versions of speech elicitation tasks. Seven acoustic features [fundamental frequency, jitter, shimmer, harmonic-to-noise ratio (HNR), cepstral peak prominence (CPP), voice onset time (VOT), and formant (F1-F2)] were extracted from each recording and analyzed using Praat software. Data were analyzed with Kolmogorov-Smirnov, two-way ANOVA, post hoc Tukey, binary logistic regression, and Student's t tests.

RESULTS: The comparison between groups showed significant differences in fundamental frequency, jitter, shimmer, CPP, and HNR (p < 0.05), while there were no significant differences in formant and VOT (p > 0.05). Binary logistic regression showed that shimmer was the most significant predictor of the disease group. There was also a significant interaction between diabetes status and age in the case of CPP.

CONCLUSIONS: Participants with type II diabetes exhibited significant vocal variations compared to non-diabetic controls.}, } @article {pmid38299984, year = {2024}, author = {Benway, NR and Preston, JL and Salekin, A and Hitchcock, E and McAllister, T}, title = {Evaluating acoustic representations and normalization for rhoticity classification in children with speech sound disorders.}, journal = {JASA express letters}, volume = {4}, number = {2}, pages = {}, pmid = {38299984}, issn = {2691-1191}, support = {R01 DC017476/DC/NIDCD NIH HHS/United States ; R01 DC020959/DC/NIDCD NIH HHS/United States ; T32 DC000046/DC/NIDCD NIH HHS/United States ; }, mesh = {Child ; Humans ; *Speech Sound Disorder/diagnosis ; Acoustics ; Engineering ; Models, Statistical ; Neural Networks, Computer ; }, abstract = {The effects of different acoustic representations and normalizations were compared for classifiers predicting perception of children's rhotic versus derhotic /ɹ/. Formant and Mel frequency cepstral coefficient (MFCC) representations for 350 speakers were z-standardized, either relative to values in the same utterance or age-and-sex data for typical /ɹ/. Statistical modeling indicated age-and-sex normalization significantly increased classifier performances. Clinically interpretable formants performed similarly to MFCCs and were endorsed for deep neural network engineering, achieving mean test-participant-specific F1-score = 0.81 after personalization and replication (σx = 0.10, med = 0.83, n = 48). Shapley additive explanations analysis indicated the third formant most influenced fully rhotic predictions.}, } @article {pmid38257406, year = {2024}, author = {Hou, Y and Li, Q and Wang, Z and Liu, T and He, Y and Li, H and Ren, Z and Guo, X and Yang, G and Liu, Y and Yu, L}, title = {Study on a Pig Vocalization Classification Method Based on Multi-Feature Fusion.}, journal = {Sensors (Basel, Switzerland)}, volume = {24}, number = {2}, pages = {}, pmid = {38257406}, issn = {1424-8220}, support = {2021ZD0113803//Scientific and Technological Innovation 2030 Program of China Ministry of Science and Technology/ ; 20YFZCSN00220//Tianjin Science and Technology Planning Project/ ; JKZX202214//Beijing Academy of Agriculture and Forestry Sciences Outstanding Scientist Training Program/ ; }, mesh = {Swine ; Animals ; *Recognition, Psychology ; *Cough ; Neural Networks, Computer ; Principal Component Analysis ; }, abstract = {To improve the classification of pig vocalization using vocal signals and improve recognition accuracy, a pig vocalization classification method based on multi-feature fusion is proposed in this study. With the typical vocalization of pigs in large-scale breeding houses as the research object, short-time energy, frequency centroid, formant frequency and first-order difference, and Mel frequency cepstral coefficient and first-order difference were extracted as the fusion features. These fusion features were improved using principal component analysis. A pig vocalization classification model with a BP neural network optimized based on the genetic algorithm was constructed. 
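The two normalizations compared in the Benway et al. entry above can be sketched as follows (hedged; the reference table is a fabricated placeholder, not published norms):

```python
import numpy as np

def z_within_utterance(track):
    """z-score a formant track against values in the same utterance."""
    return (track - track.mean()) / track.std()

def z_age_sex(track, mean, sd):
    """z-score against age-and-sex reference values for typical /ɹ/."""
    return (track - mean) / sd

f3 = np.array([2700.0, 2650.0, 2600.0, 2580.0])   # hypothetical F3 track (Hz)
norms = {("10", "F"): (2950.0, 180.0)}             # fabricated age/sex norms
mu, sd = norms[("10", "F")]
print(z_within_utterance(f3), z_age_sex(f3, mu, sd))
```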
The results showed that using the improved features to recognize pig grunting, squealing, and coughing, the average recognition accuracy was 93.2%; the recognition precisions were 87.9%, 98.1%, and 92.7%, respectively, with an average of 92.9%; and the recognition recalls were 92.0%, 99.1%, and 87.4%, respectively, with an average of 92.8%, which indicated that the proposed pig vocalization classification method had good recognition precision and recall, and could provide a reference for pig vocalization information feedback and automatic recognition.}, } @article {pmid38252795, year = {2024}, author = {Nagamine, T}, title = {Formant dynamics in second language speech: Japanese speakers' production of English liquids.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {1}, pages = {479-495}, doi = {10.1121/10.0024351}, pmid = {38252795}, issn = {1520-8524}, mesh = {Humans ; *Acoustics ; *Language ; *Speech ; }, abstract = {This article reports an acoustic study analysing the time-varying spectral properties of word-initial English liquids produced by 31 first-language (L1) Japanese and 14 L1 English speakers. While it is widely accepted that L1 Japanese speakers have difficulty in producing English /l/ and /ɹ/, the temporal characteristics of L2 English liquids are not well-understood, even in light of previous findings that English liquids show dynamic properties. In this study, the distance between the first and second formants (F2-F1) and the third formant (F3) are analysed dynamically over liquid-vowel intervals in three vowel contexts using generalised additive mixed models (GAMMs). The results demonstrate that L1 Japanese speakers produce word-initial English liquids with stronger vocalic coarticulation than L1 English speakers. L1 Japanese speakers may have difficulty in dissociating F2-F1 between the liquid and the vowel to a varying degree, depending on the vowel context, which could be related to perceptual factors. This article shows that dynamic information uncovers specific challenges that L1 Japanese speakers have in producing L2 English liquids accurately.}, } @article {pmid38226202, year = {2023}, author = {Ghaemi, H and Grillo, R and Alizadeh, O and Shirzadeh, A and Ejtehadi, B and Torkzadeh, M and Samieirad, S}, title = {What Is the Effect of Maxillary Impaction Orthognathic Surgery on Voice Characteristics? A Quasi-Experimental Study.}, journal = {World journal of plastic surgery}, volume = {12}, number = {3}, pages = {44-56}, pmid = {38226202}, issn = {2228-7914}, abstract = {BACKGROUND: Regarding the impact of orthognathic surgery on the airway and voice, this study was carried out to investigate the effects of maxillary impaction surgery on patients' voices through acoustic analysis and articulation assessment.

METHODS: This quasi-experimental, before-and-after, double-blind study aimed to examine the effects of maxillary impaction surgery on the voice of orthognathic surgery patients. Before the surgery, a speech therapist conducted acoustic analysis, which included fundamental frequency (F0), Jitter, Shimmer, and the harmonic-to-noise ratio (HNR), as well as first, second, and third formants (F1, F2, and F3). The patient's age, sex, degree of maxillary deformity, and impaction were documented in a checklist. Voice analysis was repeated during follow-up appointments at one and six months after the surgery in a blinded manner. The data were statistically analyzed using SPSS 23, and the significance level was set at 0.05.

RESULTS: Twenty-two patients (18 females, 4 males) were examined, with ages ranging from 18 to 40 years and an average age of 25.54 years. F2, F3, HNR, and Shimmer demonstrated a significant increase over the investigation period compared to the initial phase of the study (P < 0.001 for each). Conversely, the Jitter variable exhibited a significant decrease during the follow-up assessments in comparison to the initial phase of the study (P < 0.001).

CONCLUSION: Following maxillary impaction surgery, improvements in voice quality were observed compared to the preoperative condition. However, further studies with larger samples are needed to confirm the relevance of these findings.}, } @article {pmid38214609, year = {2024}, author = {Hedrick, M and Thornton, K}, title = {Reaction time for correct identification of vowels in consonant-vowel syllables and of vowel segments.}, journal = {JASA express letters}, volume = {4}, number = {1}, pages = {}, doi = {10.1121/10.0024334}, pmid = {38214609}, issn = {2691-1191}, mesh = {Adult ; Humans ; Young Adult ; Reaction Time ; *Phonetics ; }, abstract = {Reaction times for correct vowel identification were measured to determine the effects of intertrial intervals, vowel, and cue type. Thirteen adults with normal hearing, aged 20-38 years, participated. Stimuli included three naturally produced syllables (/ba/ /bi/ /bu/) presented whole or segmented to isolate the formant transition or static formant center. Participants identified the vowel presented via loudspeaker by mouse click. Results showed a significant effect of intertrial intervals, no significant effect of cue type, and a significant vowel effect, suggesting that feedback occurs, vowel identification may depend on cue duration, and vowel bias may stem from focal structure.}, } @article {pmid38174963, year = {2024}, author = {Sathe, NC and Kain, A and Reiss, LAJ}, title = {Fusion of dichotic consonants in normal-hearing and hearing-impaired listeners.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {1}, pages = {68-77}, pmid = {38174963}, issn = {1520-8524}, support = {R01 DC013307/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Hearing Loss, Sensorineural ; *Speech Perception/physiology ; *Hearing Loss ; Psychoacoustics ; Phonetics ; Hearing ; }, abstract = {Hearing-impaired (HI) listeners have been shown to exhibit increased fusion of dichotic vowels, even with different fundamental frequency (F0), leading to binaural spectral averaging and interference. To determine if similar fusion and averaging occur for consonants, four natural and synthesized stop consonants (/pa/, /ba/, /ka/, /ga/) at three F0s of 74, 106, and 185 Hz were presented dichotically, with ΔF0 varied, to normal-hearing (NH) and HI listeners. Listeners identified the one or two consonants perceived, and response options included /ta/ and /da/ as fused percepts. As ΔF0 increased, both groups showed decreases in fusion and increases in percent correct identification of both consonants, with HI listeners displaying similar fusion but poorer identification. Both groups exhibited spectral averaging (psychoacoustic fusion) of place of articulation but phonetic feature fusion for differences in voicing. With synthetic consonants, NH subjects showed increased fusion and decreased identification. Most HI listeners were unable to discriminate the synthetic consonants. The findings suggest smaller differences between groups in consonant fusion than vowel fusion, possibly due to the presence of more cues for segregation in natural speech or reduced reliance on spectral cues for consonant perception.
The inability of HI listeners to discriminate synthetic consonants suggests a reliance on cues other than formant transitions for consonant discrimination.}, } @article {pmid38165498, year = {2024}, author = {Wang, L and Liu, R and Wang, Y and Xu, X and Zhang, R and Wei, Y and Zhu, R and Zhang, X and Wang, F}, title = {Effectiveness of a Biofeedback Intervention Targeting Mental and Physical Health Among College Students Through Speech and Physiology as Biomarkers Using Machine Learning: A Randomized Controlled Trial.}, journal = {Applied psychophysiology and biofeedback}, volume = {49}, number = {1}, pages = {71-83}, pmid = {38165498}, issn = {1573-3270}, support = {ZD2021026//Key Project supported by Medical Science and Technology Development Foundation, Jiangsu Commission of Health/ ; 62176129//National Natural Science Foundation of China/ ; 81725005//National Science Fund for Distinguished Young Scholars/ ; U20A6005//National Natural Science Foundation Regional Innovation and Development Joint Fund/ ; BE2021617//Jiangsu Provincial Key Research and Development Program/ ; }, mesh = {Humans ; *Speech ; *Sleep Initiation and Maintenance Disorders ; Biofeedback, Psychology/methods ; Students/psychology ; Biomarkers ; Machine Learning ; }, abstract = {Biofeedback therapy is mainly based on the analysis of physiological features to improve an individual's affective state. There are insufficient objective indicators to assess symptom improvement after biofeedback. In addition to psychological and physiological features, speech features can precisely convey information about emotions. The use of speech features can improve the objectivity of psychiatric assessments. Therefore, biofeedback based on subjective symptom scales, objective speech, and physiological features to evaluate efficacy provides a new approach for early screening and treatment of emotional problems in college students. A 4-week, randomized, controlled, parallel biofeedback therapy study was conducted with college students with symptoms of anxiety or depression. Speech samples, physiological samples, and clinical symptoms were collected at baseline and at the end of treatment, and the extracted speech features and physiological features were used for between-group comparisons and correlation analyses between the biofeedback and wait-list groups. Based on the speech features with differences between the biofeedback intervention and wait-list groups, an artificial neural network was used to predict the therapeutic effect and response after biofeedback therapy. Through biofeedback therapy, improvements in depression (p = 0.001), anxiety (p = 0.001), insomnia (p = 0.013), and stress (p = 0.004) severity were observed in college-going students (n = 52). The speech and physiological features in the biofeedback group also changed significantly compared to the waitlist group (n = 52) and were related to the change in symptoms. The energy parameters and Mel-Frequency Cepstral Coefficients (MFCC) of speech features can predict whether biofeedback intervention effectively improves anxiety and insomnia symptoms and treatment response. The accuracy of the classification model built using the artificial neural network (ANN) for treatment response and non-response was approximately 60%. The results of this study provide valuable information about biofeedback in improving the mental health of college-going students. 
The study identified speech features such as the energy parameters and MFCC as more accurate and objective indicators for tracking biofeedback therapy response and predicting efficacy. Trial Registration ClinicalTrials.gov ChiCTR2100045542.}, } @article {pmid38158551, year = {2024}, author = {Anikin, A and Barreda, S and Reby, D}, title = {A practical guide to calculating vocal tract length and scale-invariant formant patterns.}, journal = {Behavior research methods}, volume = {56}, number = {6}, pages = {5588-5604}, pmid = {38158551}, issn = {1554-3528}, mesh = {Humans ; *Software ; Phonetics ; Speech/physiology ; Speech Acoustics ; Vocal Cords/physiology ; Acoustics ; }, abstract = {Formants (vocal tract resonances) are increasingly analyzed not only by phoneticians in speech but also by behavioral scientists studying diverse phenomena such as acoustic size exaggeration and articulatory abilities of non-human animals. This often involves estimating vocal tract length acoustically and producing scale-invariant representations of formant patterns. We present a theoretical framework and practical tools for carrying out this work, including open-source software solutions included in R packages soundgen and phonTools. Automatic formant measurement with linear predictive coding is error-prone, but formant_app provides an integrated environment for formant annotation and correction with visual and auditory feedback. Once measured, formants can be normalized using a single recording (intrinsic methods) or multiple recordings from the same individual (extrinsic methods). Intrinsic speaker normalization can be as simple as taking formant ratios and calculating the geometric mean as a measure of overall scale. The regression method implemented in the function estimateVTL calculates the apparent vocal tract length assuming a single-tube model, while its residuals provide a scale-invariant vowel space based on how far each formant deviates from equal spacing (the schwa function). Extrinsic speaker normalization provides more accurate estimates of speaker- and vowel-specific scale factors by pooling information across recordings with simple averaging or mixed models, which we illustrate with example datasets and R code. The take-home messages are to record several calls or vowels per individual, measure at least three or four formants, check formant measurements manually, treat uncertain values as missing, and use the statistical tools best suited to each modeling context.}, } @article {pmid38135960, year = {2023}, author = {Kraxberger, F and Näger, C and Laudato, M and Sundström, E and Becker, S and Mihaescu, M and Kniesburges, S and Schoder, S}, title = {On the Alignment of Acoustic and Coupled Mechanic-Acoustic Eigenmodes in Phonation by Supraglottal Duct Variations.}, journal = {Bioengineering (Basel, Switzerland)}, volume = {10}, number = {12}, pages = {}, pmid = {38135960}, issn = {2306-5354}, support = {39480417//Austrian Research Promotion Agency/ ; 446965891//Deutsche Forschungsgemeinschaft/ ; n/a//TU Graz Open Access Publishing Fund/ ; }, abstract = {Sound generation in human phonation and the underlying fluid-structure-acoustic interaction that describes the sound production mechanism are not fully understood. A previous experimental study, with a silicone-made vocal fold model connected to a straight vocal tract pipe of fixed length, showed that vibroacoustic coupling can cause a deviation in the vocal fold vibration frequency.
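The single-tube estimate implemented by estimateVTL (described in the Anikin et al. entry above) rests on the uniform-tube relation F_k = (2k - 1) * c / (4 * L), so a through-origin regression of measured formants on the odd numbers (2k - 1) yields c / (4 * L). A worked sketch with hypothetical formant values:

```python
import numpy as np

c = 35000.0                                   # speed of sound, cm/s
formants = np.array([500.0, 1500.0, 2500.0, 3500.0])   # F1-F4 (Hz), hypothetical
k = np.arange(1, formants.size + 1)
x = 2 * k - 1                                 # 1, 3, 5, 7

slope = np.sum(x * formants) / np.sum(x * x)  # least squares through the origin
vtl_cm = c / (4 * slope)
print(f"apparent VTL = {vtl_cm:.1f} cm")      # 17.5 cm for these values
```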
This occurred when the fundamental frequency of the vocal fold motion was close to the lowest acoustic resonance frequency of the pipe. What is not fully understood is how the vibroacoustic coupling is influenced by a varying vocal tract length. Presuming that this effect is a pure coupling of the acoustical effects, a numerical simulation model is established based on the computation of the mechanical-acoustic eigenvalue. With varying pipe lengths, the lowest acoustic resonance frequency was adjusted in the experiments and likewise in the simulation setup. In doing so, the evolution of the vocal folds' coupled eigenvalues and eigenmodes is investigated, which confirms the experimental findings. Finally, it was shown that for normal phonation conditions, the mechanical mode is the most efficient vibration pattern whenever the acoustic resonance of the pipe (lowest formant) is far away from the vocal folds' vibration frequency. Whenever the lowest formant is slightly lower than the mechanical vocal fold eigenfrequency, the coupled vocal fold motion pattern at the formant frequency dominates.}, } @article {pmid38082914, year = {2023}, author = {Pah, ND and Motin, MA and Oliveira, GC and Kumar, DK}, title = {The Change of Vocal Tract Length in People with Parkinson's Disease.}, journal = {Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. Annual International Conference}, volume = {2023}, number = {}, pages = {1-4}, doi = {10.1109/EMBC40787.2023.10340263}, pmid = {38082914}, issn = {2694-0604}, mesh = {Humans ; Male ; *Parkinson Disease/complications/diagnosis ; *Voice ; Dysarthria/diagnosis/etiology ; Speech ; }, abstract = {Hypokinetic dysarthria is one of the early symptoms of Parkinson's disease (PD) and has been proposed for early detection and also for monitoring of the progression of the disease. PD reduces the control of vocal tract muscles such as the tongue and lips and, therefore, the length of the active vocal tract is altered. However, the change in the vocal tract length due to the disease has not been investigated. The aim of this study was to determine the difference in the apparent vocal tract length (AVTL) between people with PD and age-matched healthy controls. The phoneme /a/ from the UCI Parkinson's Disease Classification Dataset and the Italian Parkinson's Voice and Speech Dataset were used, and AVTL was calculated based on the first four formants of the sustained phoneme (F1-F4). The results show a correlation between Parkinson's disease and an increase in vocal tract length. The most sensitive feature was the AVTL calculated using the first formant of sustained phonemes (F1). The other significant finding reported in this article is that the difference appeared only in the male participants.
However, the size of the database is not sufficiently large to identify the possible confounding factors such as the severity and duration of the disease, medication, age, and comorbidity. Clinical relevance: The outcomes of this research have the potential to improve the identification of early Parkinsonian dysarthria and monitor PD progression.}, } @article {pmid38061210, year = {2024}, author = {Orekhova, EV and Fadeev, KA and Goiaeva, DE and Obukhova, TS and Ovsiannikova, TM and Prokofyev, AO and Stroganova, TA}, title = {Different hemispheric lateralization for periodicity and formant structure of vowels in the auditory cortex and its changes between childhood and adulthood.}, journal = {Cortex; a journal devoted to the study of the nervous system and behavior}, volume = {171}, number = {}, pages = {287-307}, doi = {10.1016/j.cortex.2023.10.020}, pmid = {38061210}, issn = {1973-8102}, mesh = {Adult ; Humans ; Child ; *Auditory Cortex/physiology ; Acoustic Stimulation ; Auditory Perception/physiology ; Magnetoencephalography ; Speech/physiology ; *Speech Perception/physiology ; }, abstract = {The spectral formant structure and periodicity pitch are the major features that determine the identity of vowels and the characteristics of the speaker. However, very little is known about how the processing of these features in the auditory cortex changes during development. To address this question, we independently manipulated the periodicity and formant structure of vowels while measuring auditory cortex responses using magnetoencephalography (MEG) in children aged 7-12 years and adults. We analyzed the sustained negative shift of source current associated with these vowel properties, which was present in the auditory cortex in both age groups despite differences in the transient components of the auditory response. In adults, the sustained activation associated with formant structure was lateralized to the left hemisphere early in the auditory processing stream, requiring neither attention nor semantic mapping. This lateralization was not yet established in children, in whom the right hemisphere contribution to formant processing was strong and decreased during or after puberty. In contrast to the formant structure, periodicity was associated with a greater response in the right hemisphere in both children and adults. These findings suggest that left-lateralization for the automatic processing of vowel formant structure emerges relatively late in ontogenesis and poses a serious challenge to current theories of hemispheric specialization for speech processing.}, } @article {pmid38058304, year = {2023}, author = {Alain, C and Göke, K and Shen, D and Bidelman, GM and Bernstein, LJ and Snyder, JS}, title = {Neural alpha oscillations index context-driven perception of ambiguous vowel sequences.}, journal = {iScience}, volume = {26}, number = {12}, pages = {108457}, pmid = {38058304}, issn = {2589-0042}, abstract = {Perception of bistable stimuli is influenced by prior context. In some cases, the interpretation matches how the preceding stimulus was perceived; in others, it tends to be the opposite of the previous stimulus percept. We measured high-density electroencephalography (EEG) while participants were presented with a sequence of vowels that varied in formant transition, promoting the perception of one or two auditory streams followed by an ambiguous bistable sequence.
For the bistable sequence, participants were more likely to report hearing the opposite percept of the one heard immediately before. This auditory contrast effect coincided with changes in alpha power localized in the left angular gyrus and left sensorimotor and right sensorimotor/supramarginal areas. The latter correlated with participants' perception. These results suggest that the contrast effect for a bistable sequence of vowels may be related to neural adaptation in posterior auditory areas, which influences participants' perceptual construal level of ambiguous stimuli.}, } @article {pmid38050971, year = {2024}, author = {Shellikeri, S and Cho, S and Ash, S and Gonzalez-Recober, C and Mcmillan, CT and Elman, L and Quinn, C and Amado, DA and Baer, M and Irwin, DJ and Massimo, L and Olm, CA and Liberman, MY and Grossman, M and Nevler, N}, title = {Digital markers of motor speech impairments in spontaneous speech of patients with ALS-FTD spectrum disorders.}, journal = {Amyotrophic lateral sclerosis & frontotemporal degeneration}, volume = {25}, number = {3-4}, pages = {317-325}, pmid = {38050971}, issn = {2167-9223}, support = {K99 AG073510/AG/NIA NIH HHS/United States ; P01 AG066597/AG/NIA NIH HHS/United States ; R01 NS109260/NS/NINDS NIH HHS/United States ; K08 NS114106/NS/NINDS NIH HHS/United States ; P30 AG072979/AG/NIA NIH HHS/United States ; R01 AG054519/AG/NIA NIH HHS/United States ; }, mesh = {Humans ; *Frontotemporal Dementia/diagnosis/diagnostic imaging ; *Amyotrophic Lateral Sclerosis/complications/diagnosis ; Speech ; Magnetic Resonance Imaging ; *Dystonic Disorders ; }, abstract = {OBJECTIVE: To evaluate automated digital speech measures, derived from spontaneous speech (picture descriptions), in assessing bulbar motor impairments in patients with ALS-FTD spectrum disorders (ALS-FTSD).

METHODS: Automated vowel algorithms were employed to extract two vowel acoustic measures: vowel space area (VSA) and mean second formant slope (F2 slope). Vowel measures were compared across four groups: ALS with clinical bulbar symptoms (ALS+bulbar, n = 49; ALSFRS-R bulbar subscore: mean = 9.8, SD = 1.7), ALS without bulbar symptoms (ALS-nonbulbar, n = 23), behavioral variant frontotemporal dementia without a motor syndrome (bvFTD, n = 25), and healthy controls (HC, n = 32). Correlations with bulbar motor clinical scales, perceived listener effort, and MRI cortical thickness of the orobuccal primary motor cortex (oral PMC) were examined. We compared vowel measures to speaking rate, a conventional metric for assessing bulbar dysfunction.
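For readers implementing the VSA measure named above: one common definition is the area of the triangle spanned by the mean (F1, F2) coordinates of the corner vowels /a/, /i/, and /u/. Whether Shellikeri et al. used exactly this triangular variant is not stated in the abstract, so the sketch below is a generic illustration with invented formant values:

    def vowel_space_area(a, i, u):
        # Shoelace formula for the triangle spanned by the (F1, F2)
        # coordinates (Hz) of /a/, /i/, /u/; returns area in Hz^2.
        (f1a, f2a), (f1i, f2i), (f1u, f2u) = a, i, u
        return abs(f1a * (f2i - f2u) + f1i * (f2u - f2a) + f1u * (f2a - f2i)) / 2.0

    print(vowel_space_area(a=(850, 1220), i=(280, 2250), u=(310, 870)))  # 377850.0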

RESULTS: ALS+bulbar had significantly lower VSA and F2 slope than ALS-nonbulbar (|d| = 0.94 and |d| = 1.04, respectively), bvFTD (|d| = 0.89 and |d| = 1.47), and HC (|d| = 0.73 and |d| = 0.99). These reductions correlated with worse bulbar clinical scores (VSA: R = 0.33, p = 0.043; F2 slope: R = 0.38, p = 0.011), greater listener effort (VSA: R = -0.43, p = 0.041; F2 slope: p > 0.05), and cortical thinning in oral PMC (F2 slope: β = 0.0026, p = 0.017). Vowel measures demonstrated greater sensitivity and specificity for bulbar impairment than speaking rate, while showing independence from cognitive and respiratory impairments.

CONCLUSION: Automatic vowel measures are easily derived from a brief spontaneous speech sample, are sensitive to mild-to-moderate stages of bulbar disease in ALS-FTSD, and may offer better sensitivity to bulbar impairment than traditional assessments such as speaking rate.}, } @article {pmid38033551, year = {2023}, author = {Heeringa, AN and Jüchter, C and Beutelmann, R and Klump, GM and Köppl, C}, title = {Altered neural encoding of vowels in noise does not affect behavioral vowel discrimination in gerbils with age-related hearing loss.}, journal = {Frontiers in neuroscience}, volume = {17}, number = {}, pages = {1238941}, pmid = {38033551}, issn = {1662-4548}, abstract = {INTRODUCTION: Understanding speech in a noisy environment, as opposed to speech in quiet, becomes increasingly difficult with increasing age. Using the quiet-aged gerbil, we studied the effects of aging on speech-in-noise processing. Specifically, behavioral vowel discrimination and the encoding of these vowels by single auditory-nerve fibers were compared to elucidate some of the underlying mechanisms of age-related speech-in-noise perception deficits.

METHODS: Young-adult and quiet-aged Mongolian gerbils, of either sex, were trained to discriminate a deviant naturally spoken vowel in a sequence of vowel standards against a speech-like background noise. In addition, we recorded responses from single auditory-nerve fibers of young-adult and quiet-aged gerbils while presenting the same speech stimuli.

RESULTS: Behavioral vowel discrimination was not significantly affected by aging. For both young-adult and quiet-aged gerbils, the behavioral discrimination between /eː/ and /iː/ was more difficult to make than /eː/ vs. /aː/ or /iː/ vs. /aː/, as evidenced by longer response times and lower d' values. In young-adults, spike timing-based vowel discrimination agreed with the behavioral vowel discrimination, while in quiet-aged gerbils it did not. Paradoxically, discrimination between vowels based on temporal responses was enhanced in aged gerbils for all vowel comparisons. Representation schemes, based on the spectrum of the inter-spike interval histogram, revealed stronger encoding of both the fundamental and the lower formant frequencies in fibers of quiet-aged gerbils, but no qualitative changes in vowel encoding. Elevated thresholds in combination with a fixed stimulus level, i.e., lower sensation levels of the stimuli for old individuals, can explain the enhanced temporal coding of the vowels in noise.
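The d' values reported above come from signal detection theory; a standard way to compute them from discrimination data (with a conventional 1/(2N) correction for perfect rates; the counts below are invented) is:

    from scipy.stats import norm

    def d_prime(hits, misses, false_alarms, correct_rejections):
        # d' = z(hit rate) - z(false-alarm rate); clamp rates away from 0 and 1.
        n_s, n_n = hits + misses, false_alarms + correct_rejections
        h = min(max(hits / n_s, 0.5 / n_s), 1 - 0.5 / n_s)
        fa = min(max(false_alarms / n_n, 0.5 / n_n), 1 - 0.5 / n_n)
        return norm.ppf(h) - norm.ppf(fa)

    print(d_prime(45, 5, 10, 40))  # about 2.1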

DISCUSSION: These results suggest that the altered auditory-nerve discrimination metrics in old gerbils may mask age-related deterioration in the central (auditory) system to the extent that behavioral vowel discrimination matches that of the young adults.}, } @article {pmid38029503, year = {2024}, author = {Mohn, JL and Baese-Berk, MM and Jaramillo, S}, title = {Selectivity to acoustic features of human speech in the auditory cortex of the mouse.}, journal = {Hearing research}, volume = {441}, number = {}, pages = {108920}, pmid = {38029503}, issn = {1878-5891}, support = {R56 DC015531/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Mice ; Animals ; *Auditory Cortex/physiology ; Speech ; Acoustic Stimulation/methods ; *Speech Perception/physiology ; Acoustics ; Auditory Perception/physiology ; }, abstract = {A better understanding of the neural mechanisms of speech processing can have a major impact in the development of strategies for language learning and in addressing disorders that affect speech comprehension. Technical limitations in research with human subjects hinder a comprehensive exploration of these processes, making animal models essential for advancing the characterization of how neural circuits make speech perception possible. Here, we investigated the mouse as a model organism for studying speech processing and explored whether distinct regions of the mouse auditory cortex are sensitive to specific acoustic features of speech. We found that mice can learn to categorize frequency-shifted human speech sounds based on differences in formant transitions (FT) and voice onset time (VOT). Moreover, neurons across various auditory cortical regions were selective to these speech features, with a higher proportion of speech-selective neurons in the dorso-posterior region. Last, many of these neurons displayed mixed-selectivity for both features, an attribute that was most common in dorsal regions of the auditory cortex. Our results demonstrate that the mouse serves as a valuable model for studying the detailed mechanisms of speech feature encoding and neural plasticity during speech-sound learning.}, } @article {pmid38010781, year = {2024}, author = {Anikin, A and Valente, D and Pisanski, K and Cornec, C and Bryant, GA and Reby, D}, title = {The role of loudness in vocal intimidation.}, journal = {Journal of experimental psychology. General}, volume = {153}, number = {2}, pages = {511-530}, doi = {10.1037/xge0001508}, pmid = {38010781}, issn = {1939-2222}, support = {//Vetenskapsrådet/ ; //French National Research Agency (ANR)/ ; }, mesh = {Humans ; *Voice ; Voice Quality ; Aggression ; Communication ; Sound ; }, abstract = {Across many species, a major function of vocal communication is to convey formidability, with low voice frequencies traditionally considered the main vehicle for projecting large size and aggression. Vocal loudness is often ignored, yet it might explain some puzzling exceptions to this frequency code. Here we demonstrate, through acoustic analyses of over 3,000 human vocalizations and four perceptual experiments, that vocalizers produce low frequencies when attempting to sound large, but loudness is prioritized for displays of strength and aggression. Our results show that, although being loud is effective for signaling strength and aggression, it poses a physiological trade-off with low frequencies because a loud voice is achieved by elevating pitch and opening the mouth wide into a-like vowels. 
This may explain why aggressive vocalizations are often high-pitched and why open vowels are considered "large" in sound symbolism despite their high first formant. Callers often compensate by adding vocal harshness (nonlinear vocal phenomena) to undesirably high-pitched loud vocalizations, but a combination of low and loud remains an honest predictor of both perceived and actual physical formidability. The proposed notion of a loudness-frequency trade-off thus adds a new dimension to the widely accepted frequency code and requires a fundamental rethinking of the evolutionary forces shaping the form of acoustic signals. (PsycInfo Database Record (c) 2024 APA, all rights reserved).}, } @article {pmid38000960, year = {2023}, author = {Barrientos, E and Cataldo, E}, title = {Estimating Formant Frequencies of Vowels Sung by Sopranos Using Weighted Linear Prediction.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.10.018}, pmid = {38000960}, issn = {1873-4588}, abstract = {This study introduces the weighted linear prediction adapted to high-pitched singing voices (WLP-HPSV) method for accurately estimating formant frequencies of vowels sung by lyric sopranos. The WLP-HPSV method employs a variant of the WLP analysis combined with the zero-frequency filtering (ZFF) technique to address specific challenges in formant estimation from singing signals. Evaluation of the WLP-HPSV method compared to the LPC method demonstrated its superior performance in accurately capturing the spectral characteristics of synthetic /u/ vowels and the /a/ and /u/ natural singing vowels. The QCP parameters used in the WLP-HPSV method varied with pitch, revealing insights into the interplay between the vocal tract and glottal characteristics during vowel production. The comparison between the LPC and WLP-HPSV methods highlighted the robustness of the WLP-HPSV method in accurately estimating formant frequencies across different pitches.}, } @article {pmid37992456, year = {2024}, author = {Punamäki, RL and Diab, SY and Drosos, K and Qouta, SR and Vänskä, M}, title = {The role of acoustic features of maternal infant-directed singing in enhancing infant sensorimotor, language and socioemotional development.}, journal = {Infant behavior & development}, volume = {74}, number = {}, pages = {101908}, doi = {10.1016/j.infbeh.2023.101908}, pmid = {37992456}, issn = {1934-8800}, mesh = {Female ; Infant ; Child ; Infant, Newborn ; Humans ; *Singing ; Prospective Studies ; Speech ; Language ; Acoustics ; Language Development ; }, abstract = {The quality of infant-directed speech (IDS) and infant-directed singing (IDSi) are considered vital to children, but empirical studies on protomusical qualities of the IDSi influencing infant development are rare. The current prospective study examines the role of IDSi acoustic features, such as pitch variability, shape and movement, and vocal amplitude vibration, timbre, and resonance, in associating with infant sensorimotor, language, and socioemotional development at six and 18 months. The sample consists of 236 Palestinian mothers from Gaza Strip singing to their six-month-olds a song by their own choice. Maternal IDSi was recorded and analyzed by the OpenSMILE- tool to depict main acoustic features of pitch frequencies, variations, and contours, vocal intensity, resonance formants, and power. The results are based on completed 219 maternal IDSi. 
Mothers reported on their infants' sensorimotor, language-vocalization, and socioemotional skills at six months, and psychologists tested these skills with the Bayley Scales of Infant Development at 18 months. Results show that maternal IDSi characterized by wide pitch variability and rich and high vocal amplitude and vibration was associated with infants' optimal sensorimotor, language vocalization, and socioemotional skills at six months, and rich and high vocal amplitude and vibration predicted these optimal developmental skills also at 18 months. High resonance and rhythmicity formants were associated with optimal language and vocalization skills at six months. To conclude, the IDSi is considered important in enhancing newborns' and at-risk infants' wellbeing, and the current findings argue that favorable acoustic singing qualities are crucial for optimal multidomain development across infancy.}, } @article {pmid37992412, year = {2023}, author = {Levin, M and Zaltz, Y}, title = {Voice Discrimination in Quiet and in Background Noise by Simulated and Real Cochlear Implant Users.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {12}, pages = {5169-5186}, doi = {10.1044/2023_JSLHR-23-00019}, pmid = {37992412}, issn = {1558-9102}, mesh = {Humans ; *Cochlear Implants ; *Speech Perception ; *Cochlear Implantation/methods ; *Hearing Loss/rehabilitation ; Noise ; }, abstract = {PURPOSE: Cochlear implant (CI) users demonstrate poor voice discrimination (VD) in quiet conditions based on the speaker's fundamental frequency (fo) and formant frequencies (i.e., vocal-tract length [VTL]). Our purpose was to examine the effect of background noise at levels that allow good speech recognition thresholds (SRTs) on VD via acoustic CI simulations and CI hearing.

METHOD: Forty-eight normal-hearing (NH) listeners who listened via noise-excited (n = 20) or sinewave (n = 28) vocoders and 10 prelingually deaf CI users (i.e., whose hearing loss began before language acquisition) participated in the study. First, the signal-to-noise ratio (SNR) that yields 70.7% correct SRT was assessed using an adaptive sentence-in-noise test. Next, the CI simulation listeners performed 12 adaptive VDs: six in quiet conditions, two with each cue (fo, VTL, fo + VTL), and six amid speech-shaped noise. The CI participants performed six VDs: one with each cue, in quiet and amid noise. SNR at VD testing was 5 dB higher than the individual's SRT in noise (SRTn +5 dB).
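The 70.7%-correct SRT above is the convergence point of a two-down/one-up adaptive rule (Levitt, 1971). The exact procedure Levin and Zaltz used is not spelled out in the abstract, but a generic staircase of that family looks like this (`respond` is a hypothetical trial-scoring callback):

    def srt_two_down_one_up(respond, snr=10.0, step=2.0, n_reversals=8):
        # respond(snr) -> True if the sentence was repeated correctly at this SNR.
        # SNR falls after two consecutive correct trials and rises after any
        # error; the SRT estimate is the mean SNR at the final reversals.
        run, direction, reversals = 0, 0.0, []
        while len(reversals) < n_reversals:
            if respond(snr):
                run += 1
                move = -step if run == 2 else 0.0
                if run == 2:
                    run = 0
            else:
                run, move = 0, step
            if move:
                if direction and move * direction < 0:
                    reversals.append(snr)  # track direction changes
                direction = move
                snr += move
        return sum(reversals[-6:]) / 6.0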

RESULTS: Results showed the following: (a) Better VD was achieved via the noise-excited than the sinewave vocoder, with the noise-excited vocoder better mimicking CI VD; (b) background noise had a limited negative effect on VD, only for the CI simulation listeners; and (c) there was a significant association between SNR at testing and VTL VD only for the CI simulation listeners.

CONCLUSIONS: For NH listeners who listen to CI simulations, noise that allows good SRT can nevertheless impede VD, probably because VD depends more on bottom-up sensory processing. Conversely, for prelingually deaf CI users, noise that allows good SRT hardly affects VD, suggesting that they rely strongly on bottom-up processing for both VD and speech recognition.}, } @article {pmid37992404, year = {2024}, author = {Kapsner-Smith, MR and Abur, D and Eadie, TL and Stepp, CE}, title = {Test-Retest Reliability of Behavioral Assays of Feedforward and Feedback Auditory-Motor Control of Voice and Articulation.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {1}, pages = {34-48}, pmid = {37992404}, issn = {1558-9102}, support = {P50 DC015446/DC/NIDCD NIH HHS/United States ; T32 DC005361/DC/NIDCD NIH HHS/United States ; R01 DC015570/DC/NIDCD NIH HHS/United States ; F31 DC020359/DC/NIDCD NIH HHS/United States ; R01 DC016270/DC/NIDCD NIH HHS/United States ; }, mesh = {Male ; Humans ; Female ; Feedback ; Reproducibility of Results ; *Voice/physiology ; Speech ; Hearing ; }, abstract = {PURPOSE: Behavioral assays of feedforward and feedback auditory-motor control of voice and articulation frequently are used to make inferences about underlying neural mechanisms and to study speech development and disorders. However, no studies have examined the test-retest reliability of such measures, which is critical for rigorous study of auditory-motor control. Thus, the purpose of the present study was to assess the reliability of assays of feedforward and feedback control in voice versus articulation domains.

METHOD: Twenty-eight participants (14 cisgender women, 12 cisgender men, one transgender man, one transmasculine/nonbinary) who denied any history of speech, hearing, or neurological impairment were measured for responses to predictable versus unexpected auditory feedback perturbations of vocal (fundamental frequency, fo) and articulatory (first formant, F1) acoustic parameters twice, with 3-6 weeks between sessions. Reliability was measured with intraclass correlations.
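A sketch of one common intraclass correlation, ICC(2,1) of Shrout and Fleiss (1979), for an n-participants-by-k-sessions matrix; the abstract does not state which ICC variant the authors used, so this is illustrative only:

    import numpy as np

    def icc_2_1(x):
        # Two-way random effects, absolute agreement, single measurement.
        x = np.asarray(x, dtype=float)
        n, k = x.shape
        grand = x.mean()
        msr = k * np.sum((x.mean(axis=1) - grand) ** 2) / (n - 1)  # subjects
        msc = n * np.sum((x.mean(axis=0) - grand) ** 2) / (k - 1)  # sessions
        sse = np.sum((x - grand) ** 2) - msr * (n - 1) - msc * (k - 1)
        mse = sse / ((n - 1) * (k - 1))
        return (msr - mse) / (msr + (k - 1) * mse + k * (msc - mse) / n)

    # five hypothetical participants measured in two sessions
    print(icc_2_1([[10.1, 9.8], [12.0, 12.4], [8.7, 9.1], [11.5, 11.0], [9.9, 10.2]]))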

RESULTS: Opposite patterns of reliability were observed for fo and F1; fo reflexive responses showed good reliability and fo adaptive responses showed poor reliability, whereas F1 reflexive responses showed poor reliability and F1 adaptive responses showed moderate reliability. However, a criterion-referenced categorical measurement of fo adaptive responses as typical versus atypical showed substantial test-retest agreement.

CONCLUSIONS: Individual responses to some behavioral assays of auditory-motor control of speech should be interpreted with caution, which has implications for several fields of research. Additional research is needed to establish reliable criterion-referenced measures of F1 adaptive responses as well as fo and F1 reflexive responses. Furthermore, the opposite patterns of test-retest reliability observed for voice versus articulation add to growing evidence for differences in underlying neural control mechanisms.}, } @article {pmid37988375, year = {2023}, author = {Zhang, W and Clayards, M}, title = {Contribution of acoustic cues to prominence ratings for four Mandarin vowels.}, journal = {The Journal of the Acoustical Society of America}, volume = {154}, number = {5}, pages = {3364-3373}, doi = {10.1121/10.0022410}, pmid = {37988375}, issn = {1520-8524}, mesh = {*Cues ; Bayes Theorem ; Acoustics ; Speech Acoustics ; *Speech Perception ; Phonetics ; }, abstract = {The acoustic cues for prosodic prominence have been explored extensively, but one open question is to what extent they differ by context. This study investigates the extent to which vowel type affects how acoustic cues are related to prominence ratings provided in a corpus of spoken Mandarin. In the corpus, each syllable was rated as either prominent or non-prominent. We predicted prominence ratings using Bayesian mixed-effect regression models for each of four Mandarin vowels (/a, i, ɤ, u/), using fundamental frequency (F0), intensity, duration, the first and second formants, and tone type as predictors. We compared the role of each cue within and across the four models. We found that, overall, duration was the best predictor of prominence ratings and that formants were the weakest, but the role of each cue differed by vowel. We did not find credible evidence that F0 was relevant for /a/, or that intensity was relevant for /i/. We also found evidence that duration was more important for /ɤ/ than for /i/. The results suggest that vowel type credibly affects prominence ratings, which may reflect differences in the coordination of acoustic cues in prominence marking.}, } @article {pmid37974753, year = {2023}, author = {Jasim, M and Nayana, VG and Nayaka, H and Nayak, PS}, title = {Effect of Adenotonsillectomy on Spectral and Acoustic Characteristics.}, journal = {Indian journal of otolaryngology and head and neck surgery : official publication of the Association of Otolaryngologists of India}, volume = {75}, number = {4}, pages = {3467-3475}, pmid = {37974753}, issn = {2231-3796}, abstract = {Acoustic and perceptual analyses have been extensively used to assess the speech and voice among individuals with voice disorders. These methods provide objective, quantitative, and precise information on the speech and voice characteristics in any given disorder, help in monitoring any recovery, deterioration, or improvement in an individual's speech, and also help differentiate between normal and abnormal speech and voice characteristics. The present study was carried out to investigate changes in spectral characteristics (formant frequency parameters and formant centralization ratios) and voice characteristics (acoustic parameters of voice) in individuals following adenotonsillectomy. A total of 34 participants with a history of adenotonsillar hypertrophy took part in the study. Spectral and acoustic voice parameters were analyzed across three time points: before surgery (T0), and 30 days (T1) and 90 days (T2) after surgery.
Data were analyzed statistically using SPSS software, version 28.0.0.0. Descriptive statistics were used to find the mean and standard deviation. Repeated-measures ANOVA was used to compare the pre- and post-surgical measures of the spectral and acoustic voice parameters. The derived parameter of acoustic vowel space (formant centralization ratio 3) was compared across the three timelines. The results revealed that the acoustic vowel space and formant frequency measures increased significantly from the pre-operative to the post-operative conditions across the three timelines. Significant differences were also obtained for the acoustic parameters across the time points. Adenotonsillectomy has proven to be an effective surgical procedure for treating children with chronic adenotonsillitis. The results obtained have indicated an overall improvement in the spectral and acoustic voice parameters, thereby highlighting the need for adenotonsillectomy at the right time and at the right age.}, } @article {pmid37972580, year = {2024}, author = {Noffs, G and Cobler-Lichter, M and Perera, T and Kolbe, SC and Butzkueven, H and Boonstra, FMC and van der Walt, A and Vogel, AP}, title = {Plug-and-Play Microphones for Recording Speech and Voice with Smart Devices.}, journal = {Folia phoniatrica et logopaedica : official organ of the International Association of Logopedics and Phoniatrics (IALP)}, volume = {76}, number = {4}, pages = {372-385}, pmid = {37972580}, issn = {1421-9972}, mesh = {Humans ; *Speech Acoustics ; Male ; Speech Production Measurement/instrumentation ; Female ; Adult ; Voice Quality ; Equipment Design ; Multiple Sclerosis ; }, abstract = {INTRODUCTION: Smart devices are widely available and capable of quickly recording and uploading speech segments for health-related analysis. The switch from laboratory recordings with professional-grade microphone setups to remote, smart device-based recordings offers immense potential for the scalability of voice assessment. Yet, a growing body of literature points to a wide heterogeneity among acoustic metrics for their robustness to variation in recording devices. The addition of consumer-grade plug-and-play microphones has been proposed as a possible solution. The aim of our study was to assess whether the addition of consumer-grade plug-and-play microphones increases the acoustic measurement agreement between ultra-portable devices and a reference microphone.

METHODS: Speech was simultaneously recorded by a reference high-quality microphone commonly used in research and by two configurations with plug-and-play microphones. Twelve speech-acoustic features were calculated using recordings from each microphone to determine the agreement intervals in measurements between microphones. Agreement intervals were then compared to expected deviations in speech in various neurological conditions. Each microphone's response to speech and to silence was characterized through acoustic analysis to explore possible reasons for differences in acoustic measurements between microphones. The statistical differentiation of two groups, neurotypical and people with multiple sclerosis, using metrics from each tested microphone was compared to that of the reference microphone.
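The abstract does not specify how the agreement intervals were computed; a common choice for paired measurements of the same quantity is Bland-Altman 95% limits of agreement, sketched here with invented paired f0 values:

    import numpy as np

    def limits_of_agreement(reference, test):
        # Bland-Altman: bias +/- 1.96 SD of the pairwise differences.
        d = np.asarray(test, dtype=float) - np.asarray(reference, dtype=float)
        bias, sd = d.mean(), d.std(ddof=1)
        return bias - 1.96 * sd, bias + 1.96 * sd

    print(limits_of_agreement([118.2, 201.5, 95.7, 140.0], [118.3, 201.4, 95.8, 140.1]))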

RESULTS: The two consumer-grade plug-and-play microphones favored high frequencies (mean center of gravity difference ≥ +175.3 Hz) and recorded more noise (mean difference in signal-to-noise ratio ≤ -4.2 dB) compared with the reference microphone. Between consumer-grade microphones, differences in relative noise were closely related to the distance between the microphone and the speaker's mouth. Agreement intervals between the reference and consumer-grade microphones remained under disease-expected deviations only for fundamental frequency (f0, agreement interval ≤0.06 Hz), f0 instability (f0 CoV, agreement interval ≤0.05%), and tracking of second formant movement (agreement interval ≤1.4 Hz/ms). Agreement between microphones was poor for other metrics, particularly for fine timing metrics (mean pause length and pause length variability for various tasks). The statistical difference between the two groups of speakers was smaller with the plug-and-play than with the reference microphone.

CONCLUSION: Measurement of f0 and F2 slope was robust to variation in recording equipment, while other acoustic metrics were not. Thus, the tested plug-and-play microphones should not be used interchangeably with professional-grade microphones for speech analysis. Plug-and-play microphones may assist in equipment standardization within speech studies, including remote or self-recording, possibly with small loss in accuracy and statistical power as observed in the current study.}, } @article {pmid37944057, year = {2023}, author = {Ribas-Prats, T and Cordero, G and Lip-Sosa, DL and Arenillas-Alcón, S and Costa-Faidella, J and Gómez-Roig, MD and Escera, C}, title = {Developmental Trajectory of the Frequency-Following Response During the First 6 Months of Life.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {12}, pages = {4785-4800}, doi = {10.1044/2023_JSLHR-23-00104}, pmid = {37944057}, issn = {1558-9102}, mesh = {Infant, Newborn ; Infant ; Humans ; *Speech Perception/physiology ; Language Development ; Phonetics ; Electroencephalography ; }, abstract = {PURPOSE: The aim of the present study is to characterize the maturational changes during the first 6 months of life in the neural encoding of two speech sound features relevant for early language acquisition: the stimulus fundamental frequency (fo), related to stimulus pitch, and the vowel formant composition, particularly F1. The frequency-following response (FFR) was used as a snapshot into the neural encoding of these two stimulus attributes.

METHOD: FFRs to a consonant-vowel stimulus /da/ were retrieved from electroencephalographic recordings in a sample of 80 healthy infants (45 at birth and 35 at the age of 1 month). Thirty-two infants (16 recorded at birth and 16 recorded at 1 month) returned for a second recording at 6 months of age.

RESULTS: Stimulus fo and F1 encoding showed improvements from birth to 6 months of age. Most remarkably, a significant improvement in the F1 neural encoding was observed during the first month of life.

CONCLUSION: Our results highlight the rapid and sustained maturation of the basic neural machinery necessary for the phoneme discrimination ability during the first 6 months of age.}, } @article {pmid37943390, year = {2024}, author = {Mračková, M and Mareček, R and Mekyska, J and Košťálová, M and Rektorová, I}, title = {Levodopa may modulate specific speech impairment in Parkinson's disease: an fMRI study.}, journal = {Journal of neural transmission (Vienna, Austria : 1996)}, volume = {131}, number = {2}, pages = {181-187}, pmid = {37943390}, issn = {1435-1463}, support = {LX22NPO5107 (MEYS): Financed by EU-Next Generation EU//Ministerstvo Školství, Mládeže a Tělovýchovy/ ; }, mesh = {Humans ; *Levodopa/adverse effects ; *Parkinson Disease/complications/diagnostic imaging/drug therapy ; Speech/physiology ; Magnetic Resonance Imaging/methods ; Quality of Life ; Speech Disorders/diagnostic imaging/etiology ; Dysarthria/etiology/complications ; Antiparkinson Agents/adverse effects ; }, abstract = {Hypokinetic dysarthria (HD) is a difficult-to-treat symptom affecting quality of life in patients with Parkinson's disease (PD). Levodopa may partially alleviate some symptoms of HD in PD, but the neural correlates of these effects are not fully understood. The aim of our study was to identify neural mechanisms by which levodopa affects articulation and prosody in patients with PD. Altogether 20 PD patients participated in a task fMRI study (overt sentence reading). Using a single dose of levodopa after an overnight withdrawal of dopaminergic medication, levodopa-induced BOLD signal changes within the articulatory pathway (in regions of interest; ROIs) were studied. We also correlated levodopa-induced BOLD signal changes with the changes in acoustic parameters of speech. We observed no significant changes in acoustic parameters due to acute levodopa administration. After levodopa administration as compared to the OFF dopaminergic condition, patients showed task-induced BOLD signal decreases in the left ventral thalamus (p = 0.0033). The changes in thalamic activation were associated with changes in pitch variation (R = 0.67, p = 0.006), while the changes in caudate nucleus activation were related to changes in the second formant variability which evaluates precise articulation (R = 0.70, p = 0.003). The results are in line with the notion that levodopa does not have a major impact on HD in PD, but it may induce neural changes within the basal ganglia circuitries that are related to changes in speech prosody and articulation.}, } @article {pmid37940420, year = {2023}, author = {Liu, W and Wang, Y and Liang, C}, title = {Formant and Voice Source Characteristics of Vowels in Chinese National Singing and Bel Canto. A Pilot Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.10.016}, pmid = {37940420}, issn = {1873-4588}, abstract = {BACKGROUND: There have been numerous reports on the acoustic characteristics of singers' vowel articulation and phonation, and these studies cover many phonetic dimensions, such as fundamental frequency (F0), intensity, formant frequency, and voice quality.

METHOD: Taking the three representative vowels (/a/, /i/, /u/) in Chinese National Singing and Bel Canto as its focus, the present study investigates the differences and associations in vowel articulation and phonation between the two singing genres using acoustic measures such as F0, formant frequency, and the long-term average spectrum (LTAS).
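For reference, an LTAS of the kind used here can be computed as a long-term Welch average of the power spectrum. A minimal sketch (the `samples` and `sr` arguments are assumed to come from a loaded recording; `band_hz` sets the analysis bandwidth):

    import numpy as np
    from scipy.signal import welch

    def ltas_db(samples, sr, band_hz=100):
        # Average power spectral density over the whole recording, in dB
        # relative to the strongest band; a singer's formant shows up as
        # a peak near 2.5-3.5 kHz.
        freqs, psd = welch(samples, fs=sr, nperseg=int(sr / band_hz))
        db = 10 * np.log10(psd + 1e-12)
        return freqs, db - db.max()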

RESULTS: The relationship between F0 and the formants indicates that F1 is proportional to F0, with the female showing significant variation in the vowel /a/. Compared with the male, the formant structure of the female's singing voice differs significantly from that of her speech voice. Regarding the relationship between intensity and the formants, LTAS shows that the Chinese National Singing tenor and the Bel Canto baritone have the singer's formant cluster when singing vowels, while the two sopranos do not.

CONCLUSIONS: Systematic changes of formant frequencies with the voice source were observed. (i) F1 of the female vowel /a/ has undergone a significant tuning change in the register transition, reflecting the characteristics of singing genres. (ii) Female singers utilize the intrinsic pitch of vowels when adopting the register transition strategy. This finding may facilitate understanding of the theory of intrinsic vowel pitch and help revise Sundberg's hypothesis that F1 rises with F0. A non-linear relationship exists between F1 and F0, which adds to the non-linear interaction of the formant and vocal source. (iii) The singer's formant is affected by voice classification, gender, and singing genres.}, } @article {pmid37935372, year = {2023}, author = {Keller, PE and Lee, J and König, R and Novembre, G}, title = {Sex-related communicative functions of voice spectral energy in human chorusing.}, journal = {Biology letters}, volume = {19}, number = {11}, pages = {20230326}, pmid = {37935372}, issn = {1744-957X}, mesh = {Humans ; Male ; Female ; *Voice ; *Music ; Acoustics ; Social Behavior ; }, abstract = {Music is a human communicative art whose evolutionary origins may lie in capacities that support cooperation and/or competition. A mixed account favouring simultaneous cooperation and competition draws on analogous interactive displays produced by collectively signalling non-human animals (e.g. crickets and frogs). In these displays, rhythmically coordinated calls serve as a beacon whereby groups of males 'cooperatively' attract potential female mates, while the likelihood of each male competitively attracting an actual mate depends on the precedence of his signal. Human behaviour consistent with the mixed account was previously observed in a renowned boys choir, where the basses (the oldest boys with the deepest voices) boosted their acoustic prominence by increasing energy in a high-frequency band of the vocal spectrum when girls were in an otherwise male audience. The current study tested female and male sensitivity and preferences for this subtle vocal modulation in online listening tasks. Results indicate that while female and male listeners are similarly sensitive to enhanced high-spectral energy elicited by the presence of girls in the audience, only female listeners exhibit a reliable preference for it. Findings suggest that human chorusing is a flexible form of social communicative behaviour that allows simultaneous group cohesion and sexually motivated competition.}, } @article {pmid37925330, year = {2023}, author = {Baker, CP and Brockmann-Bauser, M and Purdy, SC and Rakena, TO}, title = {High and Wide: An In Silico Investigation of Frequency, Intensity, and Vibrato Effects on Widely Applied Acoustic Voice Perturbation and Noise Measures.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.10.007}, pmid = {37925330}, issn = {1873-4588}, abstract = {OBJECTIVES: This in silico study explored the effects of a wide range of fundamental frequency (fo), source-spectrum tilt (SST), and vibrato extent (VE) on commonly used frequency and amplitude perturbation and noise measures.

METHOD: Using 53 synthesized tones produced in Madde, the effects of stepwise increases in fo, intensity (modeled by decreasing SST), and VE on the Praat parameters jitter % (local), relative average perturbation (RAP) %, shimmer % (local), amplitude perturbation quotient 3 (APQ3) %, and harmonics-to-noise ratio (HNR) dB were investigated. A secondary experiment was conducted to determine whether any fo effects on jitter, RAP, shimmer, APQ3, and HNR were stable. A total of 10 sinewaves were synthesized in Sopran from 100 to 1000 Hz using formant frequencies for /a/-, /i/-, and /u/-like vowels, respectively. All effects were statistically assessed with Kendall's tau-b and partial correlation.
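Praat's jitter (local) % and shimmer (local) % have simple definitions that are easy to reproduce for synthetic tones like these: the mean absolute difference between consecutive periods (or per-period peak amplitudes) divided by the mean, times 100. A sketch with a simulated, slightly jittered ~200 Hz voice:

    import numpy as np

    def local_perturbation_percent(values):
        # jitter (local) % when `values` are period durations;
        # shimmer (local) % when `values` are per-period peak amplitudes.
        v = np.asarray(values, dtype=float)
        return 100.0 * np.mean(np.abs(np.diff(v))) / v.mean()

    rng = np.random.default_rng(1)
    periods = 1.0 / (200.0 + rng.normal(0.0, 1.0, 200))  # jittered 200 Hz periods
    print(local_perturbation_percent(periods))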

RESULTS: Increasing fo resulted in an overall increase in jitter, RAP, shimmer, and APQ3 values (P < 0.01). Oscillations of the data across the explored fo range were observed in all measurement outputs. In the Sopran tests, the oscillatory pattern seen in the Madde fo condition remained and showed differences between vowel conditions. Increasing intensity (decreasing SST) led to reduced pitch and amplitude perturbation and HNR (P < 0.05). Increasing VE led to lower HNR and an almost linear increase of all other measures (P < 0.05).

CONCLUSION: These novel data offer a controlled demonstration for the behavior of jitter (local) %, RAP %, shimmer (local) %, APQ3 %, and HNR (dB) when varying fo, SST, and VE in synthesized tones. Since humans will vary in all of these aspects in spoken language and vowel phonation, researchers should take potential resonance-harmonics type effects into account when comparing intersubject or preintervention and postintervention data using these measures.}, } @article {pmid37906609, year = {2023}, author = {Song, J and Kim, M and Park, J}, title = {Acoustic correlates of perceived personality from Korean utterances in a formal communicative setting.}, journal = {PloS one}, volume = {18}, number = {10}, pages = {e0293222}, pmid = {37906609}, issn = {1932-6203}, mesh = {Humans ; Female ; *Speech ; *Voice ; Acoustics ; Personality ; Language ; }, abstract = {The aim of the present study was to find acoustic correlates of perceived personality from the speech produced in a formal communicative setting-that of Korean customer service employees in particular. This work extended previous research on voice personality impressions to a different sociocultural and linguistic context in which speakers are expected to speak politely in a formal register. To use naturally produced speech rather than read speech, we devised a new method that successfully elicited spontaneous speech from speakers who were role-playing as customer service employees, while controlling for the words and sentence structures they used. We then examined a wide range of acoustic properties in the utterances, including voice quality and global acoustic and segmental properties using Principal Component Analysis. Subjects of the personality rating task listened to the utterances and rated perceived personality in terms of the Big-Five personality traits. While replicating some previous findings, we discovered several acoustic variables that exclusively accounted for the personality judgments of female speakers; a more modal voice quality increased perceived conscientiousness and neuroticism, and less dispersed formants reflecting a larger body size increased the perceived levels of extraversion and openness. These biases in personality perception likely reflect gender and occupation-related stereotypes that exist in South Korea. Our findings can also serve as a basis for developing and evaluating synthetic speech for Voice Assistant applications in future studies.}, } @article {pmid37905994, year = {2024}, author = {Ealer, C and Niemczak, CE and Nicol, T and Magohe, A and Bonacina, S and Zhang, Z and Rieke, C and Leigh, S and Kobrina, A and Lichtenstein, J and Massawe, ER and Kraus, N and Buckey, JC}, title = {Auditory neural processing in children living with HIV uncovers underlying central nervous system dysfunction.}, journal = {AIDS (London, England)}, volume = {38}, number = {3}, pages = {289-298}, pmid = {37905994}, issn = {1473-5571}, support = {R01 HD095277/HD/NICHD NIH HHS/United States ; }, mesh = {Child ; Humans ; Cohort Studies ; Cross-Sectional Studies ; *HIV Infections/complications ; Acoustic Stimulation ; Tanzania ; Central Nervous System ; }, abstract = {OBJECTIVE: Central nervous system (CNS) damage from HIV infection or treatment can lead to developmental delays and poor educational outcomes in children living with HIV (CLWH). Early markers of central nervous system dysfunction are needed to target interventions and prevent life-long disability. 
The frequency following response (FFR) is an auditory electrophysiology test that can reflect the health of the central nervous system. In this study, we explore whether the FFR reveals auditory central nervous system dysfunction in CLWH.

STUDY DESIGN: Cross-sectional analysis of an ongoing cohort study. Data were from the child's first visit in the study.

SETTING: The infectious disease center in Dar es Salaam, Tanzania.

METHODS: We collected the FFR from 151 CLWH and 151 HIV-negative children. To evoke the FFR, three speech syllables (/da/, /ba/, /ga/) were played monaurally to the child's right ear. Response measures included neural timing (peak latencies), strength of frequency encoding (fundamental frequency and first formant amplitude), encoding consistency (inter-response consistency), and encoding precision (stimulus-to-response correlation).
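Of the FFR measures listed, inter-response consistency is typically computed by correlating the averaged waveforms of two random halves of the single-trial responses; the exact variant used in this study is not specified, so the following is a generic sketch (the `trials` array is assumed):

    import numpy as np

    def inter_response_consistency(trials, n_splits=100, seed=0):
        # trials: (n_trials, n_samples) array of single-trial FFR epochs.
        # Average two random halves, correlate them, repeat, and average.
        trials = np.asarray(trials, dtype=float)
        rng = np.random.default_rng(seed)
        half = len(trials) // 2
        rs = []
        for _ in range(n_splits):
            order = rng.permutation(len(trials))
            a = trials[order[:half]].mean(axis=0)
            b = trials[order[half:]].mean(axis=0)
            rs.append(np.corrcoef(a, b)[0, 1])
        return float(np.mean(rs))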

RESULTS: CLWH showed smaller first formant amplitudes (P < 0.0001), weaker inter-response consistencies (P < 0.0001), and smaller stimulus-to-response correlations (P < 0.0001) than FFRs from HIV-negative children. These findings generalized across the three speech stimuli with moderately strong effect sizes (partial η² ranged from 0.061 to 0.094).

CONCLUSION: The FFR shows auditory central nervous system dysfunction in CLWH. Neural encoding of auditory stimuli was less robust, more variable, and less accurate. As the FFR is a passive and objective test, it may offer an effective way to assess and detect central nervous system function in CLWH.}, } @article {pmid37900335, year = {2023}, author = {Mutlu, A and Celik, S and Kilic, MA}, title = {Effects of Personal Protective Equipment on Speech Acoustics.}, journal = {Sisli Etfal Hastanesi tip bulteni}, volume = {57}, number = {3}, pages = {434-439}, pmid = {37900335}, issn = {1302-7123}, abstract = {OBJECTIVES: The transmission of severe acute respiratory syndrome coronavirus-2 occurs primarily through droplets, which highlights the importance of protecting the oral, nasal, and conjunctival mucosas using personal protective equipment (PPE). The use of PPE can lead to communication difficulties between healthcare workers and patients. This study aimed to investigate changes in the acoustic parameters of speech sounds when different types of PPE are used.

METHODS: A cross-sectional study was conducted, enrolling 18 healthy male and female participants. They were instructed to produce a sustained [ɑː] vowel for at least 3 s to estimate voice quality. In addition, all Turkish vowels were produced for a minimum of 200 ms. Finally, three Turkish fricative consonants ([f], [s], and [ʃ]) were produced in a consonant/vowel/consonant format with different vowel contexts within a carrier sentence. Recordings were repeated under the following conditions: no PPE, surgical mask, N99 mask, face shield, surgical mask + face shield, and N99 mask + face shield. All recordings were subjected to analysis.
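The abstract does not name the analysis software; a common way to estimate the vowel formants reported below is LPC root-solving, sketched here (the file name is hypothetical, and a bandwidth criterion is often added to reject spurious poles):

    import numpy as np
    import librosa

    def lpc_formants(y, sr, order=12, fmax=5000.0):
        # Fit an LPC polynomial to the vowel and convert the angles of its
        # complex roots (poles) to frequencies in Hz.
        a = librosa.lpc(y.astype(float), order=order)
        roots = [r for r in np.roots(a) if np.imag(r) > 0]
        freqs = sorted(np.angle(r) * sr / (2 * np.pi) for r in roots)
        return [f for f in freqs if 90.0 < f < fmax]

    # y, sr = librosa.load("sustained_a.wav", sr=16000)  # hypothetical recording
    # print(lpc_formants(y, sr))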

RESULTS: Frequency perturbation parameters did not show significant differences. However, in males, differences were significant for all vowels except [u] in the first formant (F1), all except [ɔ] and [u] in the second formant (F2), all except [ɛ] and [ɔ] in the third formant (F3), and only [i] in the fourth formant (F4). In females, differences were significant for all vowels except [i] in F1, all except [u] in F2, all vowels in F3, and all except [u] and [ɯ] in F4. Spectral moment values also differed significantly in both groups.
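The spectral moments mentioned here are the centre of gravity, standard deviation, skewness, and kurtosis of the spectrum treated as a distribution over frequency. A sketch using the magnitude spectrum (Praat also offers power-spectrum weighting, so the weighting choice below is an assumption):

    import numpy as np

    def spectral_moments(y, sr):
        mag = np.abs(np.fft.rfft(y * np.hanning(len(y))))
        f = np.fft.rfftfreq(len(y), d=1.0 / sr)
        p = mag / mag.sum()                       # spectrum as a distribution
        cog = np.sum(f * p)                       # 1st moment: centre of gravity
        sd = np.sqrt(np.sum((f - cog) ** 2 * p))  # 2nd: spread
        skew = np.sum((f - cog) ** 3 * p) / sd ** 3
        kurt = np.sum((f - cog) ** 4 * p) / sd ** 4 - 3.0
        return cog, sd, skew, kurt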

CONCLUSION: The use of different types of PPE resulted in variations in speech acoustic features. These findings may be attributed to the filtering effects of PPE on specific frequencies and the potential chamber effect in front of the face. Understanding the impact of PPE on speech acoustics contributes to addressing communication challenges in healthcare settings.}, } @article {pmid37877773, year = {2023}, author = {Steffman, J and Zhang, W}, title = {Vowel perception under prominence: Examining the roles of F0, duration, and distributional information.}, journal = {The Journal of the Acoustical Society of America}, volume = {154}, number = {4}, pages = {2594-2608}, doi = {10.1121/10.0021300}, pmid = {37877773}, issn = {1520-8524}, mesh = {*Cues ; *Language ; Speech ; Perception ; }, abstract = {This study investigates how prosodic prominence mediates the perception of American English vowels, testing the effects of F0 and duration. In Experiment 1, the perception of four vowel continua varying in duration and formants (high: /i-ɪ/, /u-ʊ/; non-high: /ɛ-æ/, /ʌ-ɑ/) was examined under changes in F0-based prominence. Experiment 2 tested if cue usage varies as the distributional informativity of duration as a cue to prominence is manipulated. Both experiments show that duration is a consistent vowel-intrinsic cue. F0-based prominence affected perception of vowels via compensation for peripheralization of prominent vowels in the vowel space. Longer duration and F0-based prominence further enhanced the perception of formant cues. The distributional manipulation in Experiment 2 exerted a minimal impact. Findings suggest that vowel perception is mediated by prominence in a height-dependent manner which reflects patterns in the speech production literature. Further, duration simultaneously serves as an intrinsic cue and serves a prominence-related function in enhancing perception of formant cues.}, } @article {pmid37873157, year = {2023}, author = {Wang, H and Ali, Y and Max, L}, title = {Perceptual formant discrimination during speech movement planning.}, journal = {bioRxiv : the preprint server for biology}, volume = {}, number = {}, pages = {}, pmid = {37873157}, issn = {2692-8205}, support = {R01 DC017444/DC/NIDCD NIH HHS/United States ; R01 DC020162/DC/NIDCD NIH HHS/United States ; }, abstract = {Evoked potential studies have shown that speech planning modulates auditory cortical responses. The phenomenon's functional relevance is unknown. We tested whether, during this time window of cortical auditory modulation, there is an effect on speakers' perceptual sensitivity for vowel formant discrimination. Participants made same/different judgments for pairs of stimuli consisting of a pre-recorded, self-produced vowel and a formant-shifted version of the same production. Stimuli were presented prior to a "go" signal for speaking, prior to passive listening, and during silent reading. The formant discrimination stimulus /uh/ was tested with a congruent productions list (words with /uh/) and an incongruent productions list (words without /uh/). Logistic curves were fitted to participants' responses, and the just-noticeable difference (JND) served as a measure of discrimination sensitivity. We found a statistically significant effect of condition (worst discrimination before speaking) with no congruency effect. Post-hoc pairwise comparisons revealed that JND was significantly greater before speaking than during silent reading.
Thus, formant discrimination sensitivity was reduced during speech planning regardless of the congruence between discrimination stimulus and predicted acoustic consequences of the planned speech movements. This finding may inform ongoing efforts to determine the functional relevance of the previously reported modulation of auditory processing during speech planning.}, } @article {pmid37850867, year = {2023}, author = {Miller, HE and Kearney, E and Nieto-Castañón, A and Falsini, R and Abur, D and Acosta, A and Chao, SC and Dahl, KL and Franken, M and Heller Murray, ES and Mollaei, F and Niziolek, CA and Parrell, B and Perrachione, T and Smith, DJ and Stepp, CE and Tomassi, N and Guenther, FH}, title = {Do Not Cut Off Your Tail: A Mega-Analysis of Responses to Auditory Perturbation Experiments.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {11}, pages = {4315-4331}, pmid = {37850867}, issn = {1558-9102}, support = {R21 DC017563/DC/NIDCD NIH HHS/United States ; P50 DC015446/DC/NIDCD NIH HHS/United States ; R01 DC007683/DC/NIDCD NIH HHS/United States ; R01 DC011277/DC/NIDCD NIH HHS/United States ; R00 DC014520/DC/NIDCD NIH HHS/United States ; R01 DC002852/DC/NIDCD NIH HHS/United States ; R01 DC015570/DC/NIDCD NIH HHS/United States ; F31 DC019032/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; R01 DC016270/DC/NIDCD NIH HHS/United States ; T90 DA032484/DA/NIDA NIH HHS/United States ; F31 DC016197/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; F31 DC020352/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Aged ; *Speech/physiology ; *Parkinson Disease ; Feedback, Sensory/physiology ; }, abstract = {PURPOSE: The practice of removing "following" responses from speech perturbation analyses is increasingly common, despite no clear evidence as to whether these responses represent a unique response type. This study aimed to determine if the distribution of responses to auditory perturbation paradigms represents a bimodal distribution, consisting of two distinct response types, or a unimodal distribution.

METHOD: This mega-analysis pooled data from 22 previous studies to examine the distribution and magnitude of responses to auditory perturbations across four tasks: adaptive pitch, adaptive formant, reflexive pitch, and reflexive formant. Data included at least 150 unique participants for each task, with studies comprising younger adult, older adult, and Parkinson's disease populations. Silverman's unimodality test followed by a smoothed bootstrap resampling technique was performed for each task to evaluate the number of modes in each distribution. Wilcoxon signed-rank tests were also performed for each distribution to confirm significant compensation in response to the perturbation.
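Silverman's test asks how much a Gaussian kernel density estimate must be smoothed before the sample looks unimodal; that critical bandwidth is then compared against bootstrap replicates. A compact sketch of the critical-bandwidth search only (the smoothed bootstrap step is omitted, and the search bounds are assumptions):

    import numpy as np
    from scipy.stats import gaussian_kde

    def n_modes(data, bw):
        # Count local maxima of a KDE with absolute kernel bandwidth bw.
        grid = np.linspace(data.min() - 3 * bw, data.max() + 3 * bw, 512)
        y = gaussian_kde(data, bw_method=bw / data.std(ddof=1))(grid)
        return int(np.sum((y[1:-1] > y[:-2]) & (y[1:-1] > y[2:])))

    def critical_bandwidth(data, tol=1e-4):
        # Smallest bandwidth at which the KDE is unimodal (binary search;
        # the mode count is monotone in bw for Gaussian kernels).
        data = np.asarray(data, dtype=float)
        lo, hi = tol, 4.0 * data.std(ddof=1)
        while hi - lo > tol:
            mid = 0.5 * (lo + hi)
            lo, hi = (mid, hi) if n_modes(data, mid) > 1 else (lo, mid)
        return hi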

RESULTS: Modality analyses were not significant (p > .05) for any group or task, indicating unimodal distributions. Our analyses also confirmed compensatory reflexive responses to pitch and formant perturbations across all groups, as well as adaptive responses to sustained formant perturbations. However, analyses of sustained pitch perturbations revealed evidence of adaptation only in studies with younger adults.

CONCLUSION: The demonstration of a clear unimodal distribution across all tasks suggests that following responses do not represent a distinct response pattern, but rather the tail of a unimodal distribution.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.24282676.}, } @article {pmid37845148, year = {2023}, author = {Chu, M and Wang, J and Fan, Z and Yang, M and Xu, C and Ma, Y and Tao, Z and Wu, D}, title = {A Multidomain Generative Adversarial Network for Hoarse-to-Normal Voice Conversion.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.08.027}, pmid = {37845148}, issn = {1873-4588}, abstract = {Hoarse voice affects the efficiency of communication between people. However, surgical treatment may result in patients with poorer voice quality, and voice repair techniques can only repair vowels. In this paper, we propose a novel multidomain generative adversarial voice conversion method to achieve hoarse-to-normal voice conversion and personalize voices for patients with hoarseness. The proposed method aims to improve the speech quality of hoarse voices through a multidomain generative adversarial network. The proposed method is evaluated with subjective and objective metrics. According to the findings of the spectrum analysis, the suggested method converts hoarse voice formants more effectively than variational auto-encoder (VAE), Auto-VC (voice conversion), StarGAN-VC (Generative Adversarial Network Voice Conversion), and CycleVAE. For the word error rate, the suggested method obtains absolute gains of 35.62, 37.97, 45.42, and 50.05 compared to CycleVAE, StarGAN-VC, Auto-VC, and VAE, respectively. The suggested method outperforms CycleVAE, VAE, StarGAN-VC, and Auto-VC in terms of naturalness by 42.49%, 51.60%, 69.37%, and 77.54%, respectively. The suggested method outperforms VAE, CycleVAE, StarGAN-VC, and Auto-VC in terms of intelligibility, with absolute gains of 0.87, 0.93, 1.08, and 1.13, respectively. In terms of content similarity, the proposed method obtains 43.48%, 75.52%, 76.21%, and 108.62% improvements compared to CycleVAE, StarGAN-VC, Auto-VC, and VAE, respectively. ABX results show that the suggested method can personalize the voice for patients with hoarseness. This study demonstrates the feasibility of voice conversion methods in improving the speech quality of hoarse voices.}, } @article {pmid37838586, year = {2023}, author = {Santos, SS and Christmann, MK and Cielo, CA}, title = {Spectrographic Vocal Characteristics in Female Teachers: Finger Kazoo Intensive Short-term Vocal Therapy.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.08.023}, pmid = {37838586}, issn = {1873-4588}, abstract = {OBJECTIVE: To verify the results of intensive short-term vocal therapy using the Finger Kazoo technique on the spectrographic vocal measurements of teachers.

METHODS: Controlled and randomized trial. Spectrographic vocal assessment was performed by judges before and after intensive short-term vocal therapy with Finger Kazoo. The sample was composed of 41 female teachers. There were two study groups (with vocal nodules and without structural alteration of the vocal folds) and the respective control groups. For the statistical analysis of the data, nonparametric tests were used (Mann-Whitney test and Wilcoxon test).

RESULTS: After intensive short-term vocal therapy with Finger Kazoo, spectral parameters of the voice improved: cleaner tracing (color intensity and regularity), greater definition of formants and harmonics, increased replacement of noise by harmonics, and a greater number of harmonics, mainly in the group without structural alterations of the vocal folds.

CONCLUSION: There was an improvement in the spectrographic vocal parameters, showing greater stability, quality, and projection of the emission, especially in female teachers without structural alterations of the vocal folds.}, } @article {pmid37831677, year = {2023}, author = {Kim, JA and Jang, H and Choi, Y and Min, YG and Hong, YH and Sung, JJ and Choi, SJ}, title = {Subclinical articulatory changes of vowel parameters in Korean amyotrophic lateral sclerosis patients with perceptually normal voices.}, journal = {PloS one}, volume = {18}, number = {10}, pages = {e0292460}, pmid = {37831677}, issn = {1932-6203}, mesh = {Humans ; *Dysarthria/diagnosis/etiology ; *Amyotrophic Lateral Sclerosis ; Speech Intelligibility ; Phonetics ; Republic of Korea ; Speech Acoustics ; }, abstract = {The available quantitative methods for evaluating bulbar dysfunction in patients with amyotrophic lateral sclerosis (ALS) are limited. We aimed to characterize vowel properties in Korean ALS patients, investigate associations between vowel parameters and clinical features of ALS, and analyze subclinical articulatory changes of vowel parameters in those with perceptually normal voices. Forty-three patients with ALS (27 with dysarthria and 16 without dysarthria) and 20 healthy controls were prospectively enrolled in the study. Dysarthria was assessed using the ALS Functional Rating Scale-Revised (ALSFRS-R) speech subscores, with any loss of points (from the maximum of 4) indicating the presence of dysarthria. The structured speech samples were recorded and analyzed using Praat software. For three corner vowels (/a/, /i/, and /u/), data on the vowel duration, fundamental frequency, frequencies of the first two formants (F1 and F2), harmonics-to-noise ratio, vowel space area (VSA), and vowel articulation index (VAI) were extracted from the speech samples. Corner vowel durations were significantly longer in ALS patients with dysarthria than in healthy controls. The F1 frequency of /a/, F2 frequencies of /i/ and /u/, the VSA, and the VAI showed significant differences between ALS patients with dysarthria and healthy controls. The area under the curve (AUC) for this discrimination was 0.912. The F1 frequency of /a/ and the VSA were the major determinants for differentiating ALS patients who had not yet developed apparent dysarthria from healthy controls (AUC 0.887). In linear regression analyses, as the ALSFRS-R speech subscore decreased, both the VSA and VAI were reduced. In contrast, vowel durations were found to be rather prolonged.
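
An aside on the two vowel metrics used above: the vowel space area is the area of the /a/-/i/-/u/ triangle in the F1-F2 plane (shoelace formula), and the vowel articulation index is the standard ratio of peripheral to centralized formant values, VAI = (F2i + F1a) / (F1i + F1u + F2a + F2u). A minimal Python sketch with illustrative formant means, not data from the study:

def triangular_vsa(a, i, u):
    """Shoelace area of the /a/-/i/-/u/ triangle; each vowel is an (F1, F2) pair in Hz."""
    (f1a, f2a), (f1i, f2i), (f1u, f2u) = a, i, u
    return abs(f1a * (f2i - f2u) + f1i * (f2u - f2a) + f1u * (f2a - f2i)) / 2

def vai(a, i, u):
    """Vowel articulation index: peripheral formants over centralized formants."""
    (f1a, f2a), (f1i, f2i), (f1u, f2u) = a, i, u
    return (f2i + f1a) / (f1i + f1u + f2a + f2u)

corner = {"a": (800, 1300), "i": (300, 2300), "u": (350, 800)}  # hypothetical means (Hz)
print(triangular_vsa(corner["a"], corner["i"], corner["u"]))  # Hz^2; shrinks with vowel centralization
print(vai(corner["a"], corner["i"], corner["u"]))             # ~1 when peripheral, <1 when centralized
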
The analyses of vowel parameters provided a useful metric correlated with disease severity for detecting subclinical bulbar dysfunction in ALS patients.}, } @article {pmid37830332, year = {2024}, author = {Cai, X and Ouyang, M and Yin, Y and Zhang, Q}, title = {Sensorimotor Adaptation to Formant-Shifted Auditory Feedback Is Predicted by Language-Specific Factors in L1 and L2 Speech Production.}, journal = {Language and speech}, volume = {67}, number = {3}, pages = {846-869}, doi = {10.1177/00238309231202503}, pmid = {37830332}, issn = {1756-6053}, mesh = {Humans ; Male ; Female ; *Feedback, Sensory/physiology ; Young Adult ; *Multilingualism ; Adult ; *Adaptation, Physiological ; *Speech/physiology ; Speech Perception/physiology ; Memory, Short-Term/physiology ; Executive Function/physiology ; Speech Acoustics ; Speech Production Measurement ; }, abstract = {Auditory feedback plays an important role in the long-term updating and maintenance of speech motor control; thus, the current study explored the unresolved question of how sensorimotor adaptation is predicted by language-specific and domain-general factors in first-language (L1) and second-language (L2) production. Eighteen English-L1 speakers and 22 English-L2 speakers performed the same sensorimotor adaptation experiments and tasks, which measured language-specific and domain-general abilities. The experiment manipulated the language groups (English-L1 and English-L2) and experimental conditions (baseline, early adaptation, late adaptation, and end). Linear mixed-effects model analyses indicated that auditory acuity was significantly associated with sensorimotor adaptation in L1 and L2 speakers. Analysis of vocal responses showed that L1 speakers exhibited significant sensorimotor adaptation under the early adaptation, late adaptation, and end conditions, whereas L2 speakers exhibited significant sensorimotor adaptation only under the late adaptation condition. Furthermore, the domain-general factors of working memory and executive control were not associated with adaptation/aftereffects in either L1 or L2 production, except for the role of working memory in aftereffects in L2 production. Overall, the study empirically supported the hypothesis that sensorimotor adaptation is predicted by language-specific factors such as auditory acuity and language experience, whereas general cognitive abilities do not play a major role in this process.}, } @article {pmid37827893, year = {2023}, author = {Geng, P and Fan, N and Ling, R and Li, Z and Guo, H and Lu, Q and Chen, X}, title = {Acoustic Characteristics of Mandarin Speech in Male Drug Users.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.08.022}, pmid = {37827893}, issn = {1873-4588}, abstract = {AIM: Drug use/addiction has a profound impact on the physical and mental health of individuals. Previous studies have indicated that drug users may experience speech perception disorders, including speech illusion and challenges in recognizing emotional speech. However, the influence of drugs on speech production, as another crucial aspect of speech communication, has not been thoroughly examined. Therefore, the current study aimed to investigate how drugs affect the acoustic characteristics of speech in Chinese male drug users.

METHOD: Speech recordings were collected from a total of 160 male drug users (including 106 heroin users, 23 ketamine users, and 31 methamphetamine users) and 55 male healthy controls with no history of drug use. Acoustic analysis was conducted on the collected speech data from these groups, and classification analysis was performed using five supervised learning algorithms.

RESULTS: The results demonstrated that drug users exhibited a smaller F0 standard deviation; reduced loudness, cepstral peak prominence, and formant relative energies; higher H1-A3; longer unvoiced segments; and fewer voiced segments per second compared to the control group. The classification analyses yielded good performance in classifying drug users and non-drug users, with an accuracy above 86%. Moreover, the identification of the three groups of drug users achieved an accuracy of approximately 70%. Additionally, the study revealed different effects on speech production among the three types of drugs.
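
A generic sketch of this kind of supervised classification analysis, using scikit-learn; the features and labels below are synthetic stand-ins for the study's acoustic measures, and the learners shown are common choices rather than the five the authors used:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

rng = np.random.default_rng(0)
X = rng.normal(size=(215, 8))     # 215 speakers x 8 acoustic features (e.g., F0 SD, CPP, H1-A3)
y = rng.integers(0, 2, size=215)  # 1 = drug user, 0 = control (synthetic labels)

for clf in (LogisticRegression(max_iter=1000), SVC(), RandomForestClassifier(n_estimators=200)):
    pipe = make_pipeline(StandardScaler(), clf)  # scale features, then classify
    print(type(clf).__name__, cross_val_score(pipe, X, y, cv=5).mean())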

CONCLUSION: The above findings indicate the presence of speech disorders, such as vocal hoarseness, in drug users, thus confirming the assumption that the acoustic characteristics of speech in drug users deviate from the norm. This study not only fills the knowledge gap regarding the effects of drugs on the speech production of Chinese male drug users but also provides a more comprehensive understanding of how drugs impact human behaviors. Furthermore, this research provides theoretical foundations for the detoxification and speech rehabilitation of drug users.}, } @article {pmid37817600, year = {2023}, author = {Favaro, L and Zanoli, A and Ludynia, K and Snyman, A and Carugati, F and Friard, O and Scaglione, FE and Manassero, L and Valazza, A and Mathevon, N and Gamba, M and Reby, D}, title = {Vocal tract shape variation contributes to individual vocal identity in African penguins.}, journal = {Proceedings. Biological sciences}, volume = {290}, number = {2008}, pages = {20231029}, pmid = {37817600}, issn = {1471-2954}, mesh = {Animals ; *Spheniscidae/physiology ; Vocalization, Animal/physiology ; Body Size ; Acoustics ; Communication ; }, abstract = {Variation in formant frequencies has been shown to affect social interactions and sexual competition in a range of avian species. Yet, the anatomical bases of this variation are poorly understood. Here, we investigated the morphological correlates of formant production in the vocal apparatus of African penguins. We modelled the geometry of the supra-syringeal vocal tract of 20 specimens to generate a population of virtual vocal tracts with varying dimensions. We then estimated the acoustic response of these virtual vocal tracts and extracted the centre frequency of the first four predicted formants. We demonstrate that: (i) variation in length and cross-sectional area of vocal tracts strongly affects the formant pattern, (ii) the tracheal region determines most of this variation, and (iii) the skeletal size of penguins does not correlate with the trachea length and consequently has relatively little effect on formants. We conclude that in African penguins, while the variation in vocal tract geometry generates variation in resonant frequencies supporting the discrimination of conspecifics, such variation does not provide information on the emitter's body size. Overall, our findings advance our understanding of the role of formant frequencies in bird vocal communication.}, } @article {pmid37811992, year = {2023}, author = {de Boer, MM and Heeren, WFL}, title = {The language dependency of /m/ in native Dutch and non-native English.}, journal = {The Journal of the Acoustical Society of America}, volume = {154}, number = {4}, pages = {2168-2176}, doi = {10.1121/10.0021288}, pmid = {37811992}, issn = {1520-8524}, mesh = {*Multilingualism ; *Speech Perception ; Language ; Speech ; Phonetics ; }, abstract = {In forensic speaker comparisons, the current practice is to try to avoid comparisons between speech fragments in different languages. However, globalization requires an exploration of individual speech features that may show phonetic consistency across a speaker's languages. We predicted that the bilabial nasal /m/ may be minimally affected by the language spoken due to the involvement of the rigid nasal cavity in combination with a lack of fixed oral articulatory targets.
The results show that, indeed, L1 Dutch speakers (N = 53) had similar nasal formants and formant bandwidths when speaking in their L2 English as in their native language, suggesting language independence of /m/ within speakers. In fact, the acoustics seemed to rely more on the phonetic context than on the language spoken. Nevertheless, caution should still be exercised when sampling across languages whose phoneme inventories and phonotactics show substantial differences.}, } @article {pmid37809163, year = {2023}, author = {Meng, Z and Liu, H and Ma, AC}, title = {Optimizing Voice Recognition Informatic Robots for Effective Communication in Outpatient Settings.}, journal = {Cureus}, volume = {15}, number = {9}, pages = {e44848}, pmid = {37809163}, issn = {2168-8184}, abstract = {Aim/Objective Within the dynamic healthcare technology landscape, this research aims to explore patient inquiries within outpatient clinics, elucidating the interplay between technology and healthcare intricacies. Building upon the shortcomings of an initial intelligent guidance robot implementation, this investigation seeks to enhance informatic robots with voice recognition technology. The objective is to analyze users' vocal patterns, discern age-associated vocal attributes, and facilitate age differentiation through subtle vocal nuances to enhance the efficacy of human-robot communication within outpatient clinical settings. Methods This investigation employs a multi-faceted approach. It leverages voice recognition technology to analyze users' vocal patterns. A diverse dataset of voice samples from various age groups was collected. Acoustic features encompassing pitch, formant frequencies, spectral characteristics, and vocal tract length are extracted from the audio samples. The Mel Filterbank and Mel-Frequency Cepstral Coefficients (MFCCs) are employed for speech and audio processing tasks alongside machine learning algorithms to assess and match vocal patterns to age-related traits. Results The research reveals compelling outcomes. The incorporation of voice recognition technology contributes to a significant improvement in human-robot communication within outpatient clinical settings. Through accurate analysis of vocal patterns and age-related traits, informatic robots can differentiate age through nuanced verbal cues. This augmentation leads to enhanced contextual understanding and tailored responses, significantly advancing the efficiency of patient interactions with the robots. Conclusion Integrating voice recognition technology into informatic robots presents a noteworthy advancement in outpatient clinic settings. By enabling age differentiation through vocal nuances, this augmentation enhances the precision and relevance of responses. The study contributes to the ongoing discourse on the dynamic evolution of healthcare technology, underscoring the complex synergy between technological progression and the intricate realities within healthcare infrastructure.
As healthcare continues to evolve, the seamless integration of voice recognition technology marks a pivotal stride in optimizing human-robot communication and elevating patient care within outpatient settings.}, } @article {pmid37790479, year = {2023}, author = {Mohn, JL and Baese-Berk, MM and Jaramillo, S}, title = {Selectivity to acoustic features of human speech in the auditory cortex of the mouse.}, journal = {bioRxiv : the preprint server for biology}, volume = {}, number = {}, pages = {}, pmid = {37790479}, issn = {2692-8205}, support = {R56 DC015531/DC/NIDCD NIH HHS/United States ; }, abstract = {A better understanding of the neural mechanisms of speech processing can have a major impact on the development of strategies for language learning and in addressing disorders that affect speech comprehension. Technical limitations in research with human subjects hinder a comprehensive exploration of these processes, making animal models essential for advancing the characterization of how neural circuits make speech perception possible. Here, we investigated the mouse as a model organism for studying speech processing and explored whether distinct regions of the mouse auditory cortex are sensitive to specific acoustic features of speech. We found that mice can learn to categorize frequency-shifted human speech sounds based on differences in formant transitions (FT) and voice onset time (VOT). Moreover, neurons across various auditory cortical regions were selective to these speech features, with a higher proportion of speech-selective neurons in the dorso-posterior region. Last, many of these neurons displayed mixed-selectivity for both features, an attribute that was most common in dorsal regions of the auditory cortex. Our results demonstrate that the mouse serves as a valuable model for studying the detailed mechanisms of speech feature encoding and neural plasticity during speech-sound learning.}, } @article {pmid37786950, year = {2024}, author = {Sant'Anna, LIDA and Miranda E Paulo, D and Baião, FCS and Lima, IFP and Vieira, WA and Santos, RS and Schroder, AGD and Zeigelboim, BS and Corrêa, CC and Taveira, KVM and de Araujo, CM}, title = {Can rapid maxillary expansion affect speech sound production in growing patients? A systematic review.}, journal = {Orthodontics & craniofacial research}, volume = {27}, number = {2}, pages = {185-192}, doi = {10.1111/ocr.12716}, pmid = {37786950}, issn = {1601-6343}, support = {//Conselho Nacional de Desenvolvimento Científico e Tecnológico/ ; //Coordenação de Aperfeiçoamento de Pessoal de Nível Superior/ ; //Fundação de Amparo à Pesquisa do Estado de Minas Gerais/ ; }, mesh = {Humans ; *Phonetics ; *Palatal Expansion Technique/adverse effects ; Speech ; Maxilla ; Nasal Cavity ; }, abstract = {Rapid maxillary expansion (RME) may change speech sound parameters due to the enlargement of oral and nasal cavities. This study aimed to systematically review the current evidence on speech changes as a side effect of RME. An electronic search was conducted in nine databases, two of which covered the 'grey literature'. The eligibility criteria included clinical studies assessing orthodontic patients with maxillary transverse deficiency and their relationship with speech alterations, without restriction on publication year or language. Only interventional studies were included. The JBI Critical Appraisal Tool assessed the risk of bias. The initial search provided 4853 studies. Seven articles (n = 200 patients) met the inclusion criteria and were analysed.
The primary source of bias was the absence of a control group in four studies. RME altered speech production by changing vowel fundamental frequency and fricative phoneme formant frequency. Shimmer and jitter rates changed in one and two studies, respectively. Two studies reported speech deterioration during orthodontic treatment, but speech improved after appliance removal. Despite the limited evidence, RME appears to affect speech during and after treatment.}, } @article {pmid37778391, year = {2023}, author = {Grawunder, S and Uomini, N and Samuni, L and Bortolato, T and Girard-Buttoz, C and Wittig, RM and Crockford, C}, title = {Correction: 'Chimpanzee vowel-like sounds and voice quality suggest formant space expansion through the hominoid lineage' (2021), by Grawunder et al.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {378}, number = {1890}, pages = {20230319}, pmid = {37778391}, issn = {1471-2970}, } @article {pmid37769645, year = {2024}, author = {van Brenk, F and Lowit, A and Tjaden, K}, title = {Effects of Speaking Rate on Variability of Second Formant Frequency Transitions in Dysarthria.}, journal = {Folia phoniatrica et logopaedica : official organ of the International Association of Logopedics and Phoniatrics (IALP)}, volume = {76}, number = {3}, pages = {295-308}, pmid = {37769645}, issn = {1421-9972}, support = {R01 DC004689/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Dysarthria/physiopathology/etiology ; Male ; Female ; *Parkinson Disease/complications/physiopathology ; Middle Aged ; *Speech Intelligibility ; Aged ; Speech Acoustics ; Adult ; Speech Production Measurement ; Sound Spectrography ; Ataxia/physiopathology ; Phonetics ; }, abstract = {INTRODUCTION: This study examined the utility of multiple second formant (F2) slope metrics to capture differences in speech production for individuals with dysarthria and healthy controls as a function of speaking rate. In addition, the utility of F2 slope metrics for predicting severity of intelligibility impairment in dysarthria was examined.

METHODS: Twenty-three speakers with Parkinson's disease and mild to moderate hypokinetic dysarthria (HD), 9 speakers with various neurological diseases and mild to severe ataxic or ataxic-spastic dysarthria (AD), and 26 age-matched healthy control speakers (CON) participated in a sentence repetition task. Sentences were produced at habitual, fast, and slow speaking rates. A variety of metrics were derived from the rising F2 transition portion of the diphthong /ai/. To obtain measures of intelligibility for the two clinical speaker groups, 15 undergraduate SLP students participated in a transcription experiment.

RESULTS: Significantly shallower slopes were found for the speakers with HD compared to control speakers. Steeper F2 slopes were associated with increased speaking rate for all groups. Higher variability in F2 slope metrics was found for the speakers with AD compared to the two other speaker groups. For both clinical speaker groups, there was a negative association between intelligibility and F2 slope variability metrics, indicating that lower variability in speech production was associated with higher intelligibility.

DISCUSSION: F2 slope metrics were sensitive to dysarthria presence, dysarthria type, and speaking rate. The current study provided evidence that F2 slope variability measures add value beyond averaged F2 slope measures for predicting the severity of intelligibility impairment in dysarthria.}, } @article {pmid37756574, year = {2023}, author = {Liu, W and Wang, T and Huang, X}, title = {The influences of forward context on stop-consonant perception: The combined effects of contrast and acoustic cue activation?.}, journal = {The Journal of the Acoustical Society of America}, volume = {154}, number = {3}, pages = {1903-1920}, doi = {10.1121/10.0021077}, pmid = {37756574}, issn = {1520-8524}, mesh = {*Cues ; *Speech Perception/physiology ; Phonetics ; Acoustics ; Speech Acoustics ; Acoustic Stimulation ; }, abstract = {The perception of the /da/-/ga/ series, distinguished primarily by the third formant (F3) transition, is affected by many nonspeech and speech sounds. Previous studies mainly investigated the influences of context stimuli with frequency bands located in the F3 region and proposed the account of spectral contrast effects. This study examined the effects of context stimuli with bands not in the F3 region. The results revealed that these non-F3-region stimuli (whether with bands higher or lower than the F3 region) mainly facilitated the identification of /ga/; for example, the stimuli (including frequency-modulated glides, sine-wave tones, filtered sentences, and natural vowels) in the low-frequency band (500-1500 Hz) led to more /ga/ responses than those in the low-F3 region (1500-2500 Hz). It is suggested that in the F3 region, context stimuli may act through spectral contrast effects, while in non-F3 regions, context stimuli might activate the acoustic cues of /g/ and further facilitate the identification of /ga/. The combination of contrast and acoustic cue effects can explain more results concerning the forward context influences on the perception of the /da/-/ga/ series, including the effects of non-F3-region stimuli and the imbalanced influences of context stimuli on /da/ and /ga/ perception.}, } @article {pmid37748969, year = {2023}, author = {Toppo, R and Sinha, S}, title = {The Acoustics of Gender in Indian English: Toward Forensic Profiling in a Multilingual Context.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.07.030}, pmid = {37748969}, issn = {1873-4588}, abstract = {The present study is an acoustic analysis of Indian English, specifically examining the speech patterns and characteristics of three different groups with different native languages. This study investigates fundamental frequency (fo), fo range, fo variance, formant frequencies, and vowel space size in 42 native male and female speakers of Odia, Bangla, and Hindi. Furthermore, it investigated the potential correlation between fundamental frequency and vowel space, examining whether variations in vowel space size could be influenced by gender-specific perceptual factors. The paper emphasizes that in a multilingual context, gender identification can be efficiently correlated with both fo and formant frequencies. To measure a range of acoustic characteristics, speech samples were collected via a recording task. Analysis was done in Praat. The study revealed significant differences between genders for the examined acoustic characteristics.
Results indicate differences in the size of gender-specific variations among the language groups, with females exhibiting more significant differences in fo, formant frequencies, and vowel space than males. The findings show no significant correlation between fo and vowel space area, indicating that other features are responsible for the larger vowel space of females. These findings show significant potential for creating a robust empirical framework for gender profiling that can be utilized in a wide range of forensic linguistic investigations.}, } @article {pmid37736531, year = {2023}, author = {Osiecka, AN and Briefer, EF and Kidawa, D and Wojczulanis-Jakubas, K}, title = {Social calls of the little auk (Alle alle) reflect body size and possibly partnership, but not sex.}, journal = {Royal Society open science}, volume = {10}, number = {9}, pages = {230845}, pmid = {37736531}, issn = {2054-5703}, abstract = {Source-filter theory posits that an individual's size and vocal tract length are reflected in the parameters of their calls. In species that mate assortatively, this could result in vocal similarity. In the context of mate selection, this would mean that animals could listen in to find a partner that sounds, and therefore is, similar to them. We investigated the social calls of the little auk (Alle alle), a highly vocal seabird mating assortatively, using vocalizations produced inside 15 nests by known individuals. Source- and filter-related acoustic parameters were used in linear mixed models testing the possible impact of body size. A principal component analysis followed by a permuted discriminant function analysis tested the effect of sex. Additionally, randomization procedures tested whether partners are more vocally similar than random birds. There was a significant effect of size on the mean fundamental frequency of a simple call, but not on parameters of a multisyllable call with apparent formants. Neither sex nor partnership influenced the calls; there was, however, a tendency to match certain parameters between partners. This indicates that vocal cues are at best weak indicators of size, and other factors likely play a role in mate selection.}, } @article {pmid37730823, year = {2023}, author = {Georgiou, GP}, title = {Comparison of the prediction accuracy of machine learning algorithms in crosslinguistic vowel classification.}, journal = {Scientific reports}, volume = {13}, number = {1}, pages = {15594}, pmid = {37730823}, issn = {2045-2322}, mesh = {Adult ; Humans ; *Algorithms ; *Neural Networks, Computer ; Acoustics ; Discriminant Analysis ; Machine Learning ; }, abstract = {Machine learning algorithms can be used for the prediction of nonnative sound classification based on crosslinguistic acoustic similarity. To date, very few linguistic studies have compared the classification accuracy of different algorithms. This study aims to assess how well machines align with human speech perception by assessing the ability of three machine learning algorithms, namely, linear discriminant analysis (LDA), decision tree (C5.0), and neural network (NNET), to predict the classification of second language (L2) sounds in terms of first language (L1) categories. The models were trained using the first three formants and duration of L1 vowels and fed with the same acoustic features of L2 vowels. To validate their accuracy, adult L2 speakers completed a perceptual classification task.
The results indicated that NNET successfully predicted, for all L2 vowels, the L1 category chosen by the highest proportion of listeners, while LDA and C5.0 missed only one vowel each. Furthermore, NNET exhibited superior accuracy in predicting the full range of above-chance responses, followed closely by LDA. C5.0 did not meet the anticipated performance levels. The findings hold significant implications for advancing both the theoretical and practical frameworks of speech acquisition.}, } @article {pmid37717981, year = {2023}, author = {Zhang, T and Liu, X and Liu, G and Shao, Y}, title = {PVR-AFM: A Pathological Voice Repair System based on Non-linear Structure.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {5}, pages = {648-662}, doi = {10.1016/j.jvoice.2021.05.010}, pmid = {37717981}, issn = {1873-4588}, mesh = {Humans ; Aged ; *Voice ; Speech ; *Voice Disorders/diagnosis ; Algorithms ; Cognition ; }, abstract = {OBJECTIVE: Speech signal processing has become an important technique to ensure that voice interaction systems communicate accurately with the user by improving the clarity or intelligibility of speech signals. However, most existing work focuses only on processing the voices of average speakers and ignores the communication needs of individuals suffering from voice disorders, including voice-related professionals, older people, and smokers. To meet this need, it is essential to design a non-invasive repair system that processes pathological voices.

METHODS: In this paper, we propose a repair system for multiple vowels produced by patients with vocal polyps, such as /a/, /i/, and /u/. We utilize a non-linear model based on an amplitude-modulation (AM) and frequency-modulation (FM) structure to extract the pitch and formants of the pathological voice. To address pitch breaks and instability, we provide a pitch extraction algorithm that ensures pitch stability and avoids pitch-doubling errors caused by the instability of the low-frequency signal. Furthermore, we design a formant reconstruction mechanism that can effectively determine the frequency and bandwidth needed to accomplish formant repair.

RESULTS: Spectral observations and objective indicators show that the system performs well in improving the intelligibility of pathological speech.}, } @article {pmid37701868, year = {2023}, author = {Roland, V and Huet, K and Harmegnies, B and Piccaluga, M and Verhaegen, C and Delvaux, V}, title = {Vowel production: a potential speech biomarker for early detection of dysarthria in Parkinson's disease.}, journal = {Frontiers in psychology}, volume = {14}, number = {}, pages = {1129830}, pmid = {37701868}, issn = {1664-1078}, abstract = {OBJECTIVES: Our aim is to detect early, subclinical speech biomarkers of dysarthria in Parkinson's disease (PD), i.e., systematic atypicalities in speech that remain subtle and are not easily detectable by the clinician, so that the patient is labeled "non-dysarthric." Based on promising exploratory work, we examine here whether vowel articulation, as assessed by three acoustic metrics, can be used as an early indicator of speech difficulties associated with Parkinson's disease.

STUDY DESIGN: This is a prospective case-control study.

METHODS: Sixty-three individuals with PD and 35 without PD (healthy controls, HC) participated in this study. Of the 63 PD patients, 43 had been diagnosed with dysarthria (DPD) and 20 had not (NDPD). Sustained vowels were recorded for each speaker and formant frequencies were measured. The analyses focused on three acoustic metrics: individual vowel triangle areas (tVSA), the vowel articulation index (VAI), and the Phi index.
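
Formant measurement of sustained vowels is commonly scripted through Praat; a minimal sketch using the parselmouth Python bindings (the file name is hypothetical and the tracker settings shown are Praat defaults, not necessarily those used in the study):

import parselmouth

snd = parselmouth.Sound("sustained_a.wav")  # hypothetical recording of a sustained /a/
formants = snd.to_formant_burg(max_number_of_formants=5, maximum_formant=5500)

t_mid = snd.duration / 2  # sample at the vowel midpoint
f1 = formants.get_value_at_time(1, t_mid)
f2 = formants.get_value_at_time(2, t_mid)
print(f"F1 = {f1:.0f} Hz, F2 = {f2:.0f} Hz")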

RESULTS: tVSA was found to be significantly smaller for DPD speakers than for HC. The VAI showed significant differences between these two groups, indicating greater centralization and lower vowel contrasts in the DPD speakers. In addition, DPD and NDPD speakers had lower Phi values, indicating a lower organization of their vowel systems compared to the HC. Results also showed that the VAI was the most efficient metric for distinguishing between DPD and NDPD, whereas the Phi index was the best acoustic metric for discriminating NDPD and HC.

CONCLUSION: This acoustic study identified potential subclinical vowel-related speech biomarkers of dysarthria in speakers with Parkinson's disease who have not been diagnosed with dysarthria.}, } @article {pmid37695295, year = {2023}, author = {Perrine, BL and Scherer, RC}, title = {Using a vertical three-mass computational model of the vocal folds to match human phonation of three adult males.}, journal = {The Journal of the Acoustical Society of America}, volume = {154}, number = {3}, pages = {1505-1525}, pmid = {37695295}, issn = {1520-8524}, support = {R01 DC007640/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Adult ; Male ; *Vocal Cords ; Phonation ; Glottis ; *Larynx ; Computer Simulation ; }, abstract = {Computer models of phonation are used to study various parameters that are difficult to control, measure, and observe in human subjects. Imitating human phonation by varying the prephonatory conditions of computer models offers insight into the variations that occur across human phonatory production. In the present study, a vertical three-mass computer model of phonation [Perrine, Scherer, Fulcher, and Zhai (2020). J. Acoust. Soc. Am. 147, 1727-1737], driven by empirical pressures from a physical model of the vocal folds (model M5), with a vocal tract following the design of Ishizaka and Flanagan [(1972). Bell Sys. Tech. J. 51, 1233-1268] was used to match prolonged vowels produced by three male subjects using various pitch and loudness levels. The prephonatory conditions of tissue mass and tension, subglottal pressure, glottal diameter and angle, posterior glottal gap, false vocal fold gap, and vocal tract cross-sectional areas were varied in the model to match the model output with the fundamental frequency, alternating current airflow, direct current airflow, skewing quotient, open quotient, maximum flow negative derivative, and the first three formant frequencies from the human production. Parameters were matched between the model and human subjects with an average overall percent mismatch of 4.40% (standard deviation = 6.75%), suggesting a reasonable ability of the simple low dimensional model to mimic these variables.}, } @article {pmid37650429, year = {2023}, author = {Steffman, J}, title = {Vowel-internal cues to vowel quality and prominence in speech perception.}, journal = {Phonetica}, volume = {80}, number = {5}, pages = {329-356}, pmid = {37650429}, issn = {1423-0321}, mesh = {Humans ; *Cues ; *Speech Perception ; Language ; Speech ; Phonetics ; Speech Acoustics ; }, abstract = {This study examines how variation in F0 and intensity impacts the perception of American English vowels. Both properties vary intrinsically as a function of vowel features in the speech production literature, raising the question of the perceptual impact of each. In addition to considering listeners' interpretation of either cue as an intrinsic property of the vowel, the possible prominence-marking function of each is considered. Two patterns of prominence strengthening in vowels, sonority expansion and hyperarticulation, are tested in light of recent findings that contextual prominence impacts vowel perception in line with these effects (i.e. a prominent vowel is expected by listeners to be realized as if it had undergone prominence strengthening). Across four vowel contrasts with different height and frontness features, listeners categorized phonetic continua with variation in formants, F0 and intensity. 
Results show that variation in level F0 height is interpreted as an intrinsic cue by listeners. Higher F0 cues a higher vowel, following intrinsic F0 effects in the production literature. In comparison, intensity is interpreted as a prominence-lending cue, for which effect directionality is dependent on vowel height. Higher intensity high vowels undergo perceptual re-calibration in line with (acoustic) hyperarticulation, whereas higher intensity non-high vowels undergo perceptual re-calibration in line with sonority expansion.}, } @article {pmid37630210, year = {2023}, author = {Yang, J and Yue, Y and Lv, H and Ren, B and Zhang, Y}, title = {Effect of Adding Intermediate Layers on the Interface Bonding Performance of WC-Co Diamond-Coated Cemented Carbide Tool Materials.}, journal = {Molecules (Basel, Switzerland)}, volume = {28}, number = {16}, pages = {}, pmid = {37630210}, issn = {1420-3049}, support = {ZR2022ME129//Natural Science Foundation of Shandong Province of China/ ; 2021-2//Science and Technology Research-Revealing-list System- special project of QingdaoWest Coast New Area of Shandong province of China/ ; }, abstract = {The interface models of diamond-coated WC-Co cemented carbide (DCCC) were constructed without intermediate layers and with different interface terminals, such as intermediate layers of TiC, TiN, CrN, and SiC. The adhesion work of the interface model was calculated based on the first principle. The results show that the adhesion work of the interface was increased after adding four intermediate layers. Their effect on improving the interface adhesion performance of cemented carbide coated with diamond was ranked in descending order as follows: SiC > CrN > TiC > TiN. The charge density difference and the density of states were further analyzed. After adding the intermediate layer, the charge distribution at the interface junction was changed, and the electron cloud at the interface junction overlapped to form a more stable chemical bond. Additionally, after adding the intermediate layer, the density of states of the atoms at the interface increased in the energy overlapping area. The formant formed between the electronic orbitals enhances the bond strength. Thus, the interface bonding performance of DCCC was enhanced. Among them, the most obvious was the interatomic electron cloud overlapping at the diamond/SiCC-Si/WC-Co interface, its bond length was the shortest (1.62 Å), the energy region forming the resonance peak was the largest (-5-20 eV), and the bonding was the strongest. The interatomic bond length at the diamond/TiNTi/WC-Co interface was the longest (4.11 Å), the energy region forming the resonance peak was the smallest (-5-16 eV), and the bonding was the weakest. Comprehensively considering four kinds of intermediate layers, the best intermediate layer for improving the interface bonding performance of DCCC was SiC, and the worst was TiN.}, } @article {pmid37616075, year = {2023}, author = {Bradshaw, AR and Lametti, DR and Shiller, DM and Jasmin, K and Huang, R and McGettigan, C}, title = {Speech motor adaptation during synchronous and metronome-timed speech.}, journal = {Journal of experimental psychology. 
General}, volume = {152}, number = {12}, pages = {3476-3489}, doi = {10.1037/xge0001459}, pmid = {37616075}, issn = {1939-2222}, support = {//Leverhulme Trust/ ; }, mesh = {Humans ; Speech/physiology ; *Speech Perception/physiology ; *Voice/physiology ; Phonetics ; Learning ; }, abstract = {Sensorimotor integration during speech has been investigated by altering the sound of a speaker's voice in real time; in response, the speaker learns to change their production of speech sounds in order to compensate (adaptation). This line of research has however been predominantly limited to very simple speaking contexts, typically involving (a) repetitive production of single words and (b) production of speech while alone, without the usual exposure to other voices. This study investigated adaptation to a real-time perturbation of the first and second formants during production of sentences either in synchrony with a prerecorded voice (synchronous speech group) or alone (solo speech group). Experiment 1 (n = 30) found no significant difference in the average magnitude of compensatory formant changes between the groups; however, synchronous speech resulted in increased between-individual variability in such formant changes. Participants also showed acoustic-phonetic convergence to the voice they were synchronizing with prior to introduction of the feedback alteration. Furthermore, the extent to which the changes required for convergence agreed with those required for adaptation was positively correlated with the magnitude of subsequent adaptation. Experiment 2 tested an additional group with a metronome-timed speech task (n = 15) and found a similar pattern of increased between-participant variability in formant changes. These findings demonstrate that speech motor adaptation can be measured robustly at the group level during performance of more complex speaking tasks; however, further work is needed to resolve whether self-voice adaptation and other-voice convergence reflect additive or interactive effects during sensorimotor control of speech. (PsycInfo Database Record (c) 2023 APA, all rights reserved).}, } @article {pmid37591234, year = {2023}, author = {Ancel, EE and Smith, ML and Rao, VNV and Munson, B}, title = {Relating Acoustic Measures to Listener Ratings of Children's Productions of Word-Initial /ɹ/ and /w/.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {9}, pages = {3413-3427}, pmid = {37591234}, issn = {1558-9102}, support = {R01 DC002932/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Humans ; Child, Preschool ; Reproducibility of Results ; *Acoustics ; Educational Status ; *Schools ; }, abstract = {PURPOSE: The /ɹ/ productions of young children acquiring American English are highly variable and often inaccurate, with [w] as the most common substitution error. One acoustic indicator of the goodness of children's /ɹ/ productions is the difference between the frequency of the second formant (F2) and the third formant (F3), with a smaller F3-F2 difference being associated with a perceptually more adultlike /ɹ/. This study analyzed the effectiveness of automatically extracted F3-F2 differences in characterizing young children's productions of /ɹ/-/w/ in comparison with manually coded measurements.

METHOD: Automated F3-F2 differences were extracted from productions of a variety of different /ɹ/- and /w/-initial words spoken by 3- to 4-year-old monolingual preschoolers (N = 117; 2,278 tokens in total). These automated measures were compared to phoneme-goodness ratings of the children's productions provided by untrained adult listeners (n = 132) on a visual analog scale, as well as to narrow transcriptions of each production into four categories: [ɹ], [w], and two intermediate categories.
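
Once per-token formant estimates exist, the F3-F2 difference and its relationship to listener ratings reduce to elementary array arithmetic; a small illustrative sketch with invented values (not the study's tokens):

import numpy as np

f2 = np.array([1500.0, 1700.0, 1200.0, 1900.0])  # hypothetical F2 at the segment midpoint (Hz)
f3 = np.array([2900.0, 2100.0, 2950.0, 2200.0])  # hypothetical F3 (Hz)
ratings = np.array([0.2, 0.8, 0.1, 0.9])         # goodness ratings, 0 = /w/-like, 1 = /r/-like

f3_f2 = f3 - f2  # smaller difference ~ more adult-like /r/
r = np.corrcoef(f3_f2, ratings)[0, 1]
print(f"Pearson r between F3-F2 and ratings: {r:.2f}")  # negative if small differences get high ratings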

RESULTS: Data visualizations show a weak relationship of automated F3-F2 differences with both listener ratings and narrow transcriptions. Mixed-effects models suggest the automated F3-F2 difference only modestly predicts listener ratings (R² = .37) and narrow transcriptions (R² = .32).

CONCLUSION: The weak relationship between automated F3-F2 difference and both listener ratings and narrow transcriptions suggests that these automated acoustic measures are of questionable reliability and utility in assessing preschool children's mastery of the /ɹ/-/w/ contrast.}, } @article {pmid37555773, year = {2023}, author = {Stilp, C and Chodroff, E}, title = {"Please say what this word is": Linguistic experience and acoustic context interact in vowel categorization .}, journal = {JASA express letters}, volume = {3}, number = {8}, pages = {}, pmid = {37555773}, issn = {2691-1191}, support = {R01 DC020303/DC/NIDCD NIH HHS/United States ; }, mesh = {*Phonetics ; *Speech Perception ; Acoustics ; Speech Acoustics ; Language ; }, abstract = {Ladefoged and Broadbent [(1957). J. Acoust. Soc. Am. 29(1), 98-104] is a foundational study in speech perception research, demonstrating that acoustic properties of earlier sounds alter perception of subsequent sounds: a context sentence with a lowered first formant (F1) frequency promotes perception of a raised F1 in a target word, and vice versa. The present study replicated the original with U.K. and U.S. listeners. While the direction of the perceptual shift was consistent with the original study, neither sample replicated the large effect sizes. This invites consideration of how linguistic experience relates to the magnitudes of these context effects.}, } @article {pmid37555771, year = {2023}, author = {Tanner, J}, title = {Prosodic and durational influences on the formant dynamics of Japanese vowels.}, journal = {JASA express letters}, volume = {3}, number = {8}, pages = {}, doi = {10.1121/10.0020547}, pmid = {37555771}, issn = {2691-1191}, mesh = {Humans ; Phonetics ; *Speech Acoustics ; *Language ; }, abstract = {The relationship between prosodic structure and segmental realisation is a central question within phonetics. For vowels, this has been typically examined in terms of duration, leaving largely unanswered how prosodic boundaries influence spectral realisation. This study examines the influence of prosodic boundary strength-as well as duration and pauses-on vowel dynamics in spontaneous Japanese. While boundary strength has a marginal effect on dynamics, increased duration and pauses result in greater vowel peripherality and spectral change. These findings highlight the complex relationship between prosodic and segmental structure, and illustrate the importance of multifactorial analysis in corpus research.}, } @article {pmid37547022, year = {2023}, author = {Hilger, A and Cole, J and Larson, C}, title = {Task-dependent pitch auditory feedback control in cerebellar ataxia.}, journal = {Research square}, volume = {}, number = {}, pages = {}, pmid = {37547022}, issn = {2693-5015}, support = {F31 DC017877/DC/NIDCD NIH HHS/United States ; }, abstract = {PURPOSE: The purpose of this study was to investigate how ataxia affects the task-dependent role of pitch auditory feedback control in speech. In previous research, individuals with ataxia produced over-corrected, hypermetric compensatory responses to unexpected pitch and formant frequency perturbations in auditory feedback in sustained vowels and single words (Houde et al., 2019; Li et al., 2019; Parrell et al., 2017). In this study, we investigated whether ataxia would also affect the task-dependent role of the auditory feedback control system, measuring whether pitch-shift responses would be mediated by speech task or semantic focus pattern as they are in neurologically healthy speakers.

METHODS: Twenty-two adults with ataxia and 29 age- and sex-matched control participants produced sustained vowels and sentences with and without corrective focus while their auditory feedback was briefly and unexpectedly perturbed in pitch by +/-200 cents. The magnitude and latency of the reflexive pitch-shift responses were measured as a reflection of auditory feedback control.
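
Perturbations and responses in such paradigms are measured in cents, a log-frequency unit defined by cents = 1200 * log2(f / fref). A minimal sketch of the conversions (the F0 values are illustrative):

import math

def shift_hz(f0: float, cents: float) -> float:
    """Frequency after applying a pitch shift of the given size in cents."""
    return f0 * 2 ** (cents / 1200)

def to_cents(f: float, ref: float) -> float:
    """Express frequency f relative to ref in cents."""
    return 1200 * math.log2(f / ref)

f0 = 220.0                  # a speaker's habitual F0 (illustrative)
heard = shift_hz(f0, -200)  # feedback shifted 200 cents downward, ~196 Hz
response = 224.0            # hypothetical opposing (compensatory) production
print(f"feedback: {heard:.1f} Hz; response: {to_cents(response, f0):+.1f} cents")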

RESULTS: Individuals with ataxia produced larger reflexive pitch-shift responses in both the sustained-vowel and sentence-production tasks than the control participants. Additionally, a differential response magnitude was observed by task and sentence focus pattern for both groups.

CONCLUSION: These findings demonstrate that even though accuracy of auditory feedback control correction is affected by cerebellar damage, as evidenced by the hypermetric responses, the system still retains efficiency in utilizing the task-dependent role of auditory feedback.}, } @article {pmid37541926, year = {2023}, author = {Gao, Y and Feng, Y and Wu, D and Lu, F and He, H and Tian, C}, title = {Effect of Wearing Different Masks on Acoustic, Aerodynamic, and Formant Parameters.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.06.018}, pmid = {37541926}, issn = {1873-4588}, abstract = {OBJECTIVE: This study aimed to investigate the effects of different types of masks on acoustic, aerodynamic, and formant parameters in healthy people.

METHODS: Our study involved 30 healthy participants, 15 of each gender, aged 20-40 years. The tests were conducted under four conditions: without a mask, after wearing a surgical mask, after wearing a head-mounted N95 mask, and after wearing an ear-mounted N95 mask. Voice recording was done with the mask on. The acoustic parameters included mean fundamental frequency (F0), mean intensity, percentage of jitter (local), percentage of shimmer (local), and mean noise-to-harmonics ratio (NHR); the aerodynamic parameter was maximum phonation time (MPT); and the formant parameters were F1 and F2 of the three vowels /a/, /i/, and /u/.
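
Jitter (local), shimmer (local), and harmonicity measures of this kind are typically computed in Praat; a sketch using the parselmouth bindings with Praat's default perturbation thresholds (the file name is hypothetical, and these are not necessarily the authors' settings):

import parselmouth
from parselmouth.praat import call

snd = parselmouth.Sound("vowel_a.wav")  # hypothetical sustained-vowel recording
pp = call(snd, "To PointProcess (periodic, cc)", 75, 500)  # glottal pulses, F0 search range in Hz

jitter_local = call(pp, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
shimmer_local = call([snd, pp], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
hnr = call(snd.to_harmonicity_cc(), "Get mean", 0, 0)

print(f"jitter = {jitter_local:.2%}, shimmer = {shimmer_local:.2%}, HNR = {hnr:.1f} dB")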

RESULTS: The main effect of mask type was significant for MPT, mean F0, mean HNR, /a/F1, /a/F2, and /i/F2. However, the effect sizes and power for /a/F2 and /i/F2 were low. MPT, mean F0, and mean HNR significantly increased and /a/F1 significantly decreased after wearing the head-mounted N95 mask. Mean F0 and mean HNR increased significantly after wearing the ear-mounted N95 mask. No significant changes in any parameter were observed after wearing the surgical mask. When the statistics were computed separately for males and females, the results were similar to those obtained for the pooled sample.

CONCLUSION: After wearing the surgical mask, this study found insignificant changes in mean F0, jitter (local), shimmer (local), mean NHR, mean intensity, MPT, and the vowel formants F1 and F2. This may be due to the looser design of the surgical mask and its relatively small attenuation of sound. N95 masks have a greater effect on vocalization than surgical masks and may cause changes in F0 and HNR. In the present study, no significant changes in jitter and shimmer were observed after wearing any mask. In addition, the significant reduction in /a/F1 after wearing the head-mounted N95 mask may be owing to its strong restriction of jaw mobility. Future studies could additionally examine changes in jaw movement amplitude during mask wearing.}, } @article {pmid37522248, year = {2023}, author = {Rizzi, R and Bidelman, GM}, title = {Duplex perception reveals brainstem auditory representations are modulated by listeners' ongoing percept for speech.}, journal = {Cerebral cortex (New York, N.Y. : 1991)}, volume = {33}, number = {18}, pages = {10076-10086}, pmid = {37522248}, issn = {1460-2199}, support = {R01 DC016267/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Speech ; *Speech Perception/physiology ; Brain Stem/physiology ; Brain/physiology ; Hearing ; Auditory Perception/physiology ; Acoustic Stimulation ; }, abstract = {So-called duplex speech stimuli with perceptually ambiguous spectral cues to one ear and isolated low- versus high-frequency third formant "chirp" to the opposite ear yield a coherent percept supporting their phonetic categorization. Critically, such dichotic sounds are only perceived categorically upon binaural integration. Here, we used frequency-following responses (FFRs), scalp-recorded potentials reflecting phase-locked subcortical activity, to investigate brainstem responses to fused speech percepts and to determine whether FFRs reflect binaurally integrated category-level representations. We recorded FFRs to diotic and dichotic stop-consonants (/da/, /ga/) that either did or did not require binaural fusion to properly label along with perceptually ambiguous sounds without clear phonetic identity. Behaviorally, listeners showed clear categorization of dichotic speech tokens confirming they were heard with a fused, phonetic percept. Neurally, we found FFRs were stronger for categorically perceived speech relative to category-ambiguous tokens but also differentiated phonetic categories for both diotically and dichotically presented speech sounds. Correlations between neural and behavioral data further showed FFR latency predicted the degree to which listeners labeled tokens as "da" versus "ga."
The presence of binaurally integrated, category-level information in FFRs suggests human brainstem processing reflects a surprisingly abstract level of the speech code typically circumscribed to much later cortical processing.}, } @article {pmid37506120, year = {2023}, author = {Kim, KS and Gaines, JL and Parrell, B and Ramanarayanan, V and Nagarajan, SS and Houde, JF}, title = {Mechanisms of sensorimotor adaptation in a hierarchical state feedback control model of speech.}, journal = {PLoS computational biology}, volume = {19}, number = {7}, pages = {e1011244}, pmid = {37506120}, issn = {1553-7358}, support = {R01 DC017696/DC/NIDCD NIH HHS/United States ; F32 DC019538/DC/NIDCD NIH HHS/United States ; R01 DC013979/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; P50 DC019900/DC/NIDCD NIH HHS/United States ; R01 NS100440/NS/NINDS NIH HHS/United States ; }, mesh = {Humans ; Feedback ; *Speech ; *Adaptation, Physiological ; Feedback, Sensory ; Movement ; }, abstract = {Upon perceiving sensory errors during movements, the human sensorimotor system updates future movements to compensate for the errors, a phenomenon called sensorimotor adaptation. One component of this adaptation is thought to be driven by sensory prediction errors-discrepancies between predicted and actual sensory feedback. However, the mechanisms by which prediction errors drive adaptation remain unclear. Here, auditory prediction error-based mechanisms involved in speech auditory-motor adaptation were examined via the feedback aware control of tasks in speech (FACTS) model. Consistent with theoretical perspectives in both non-speech and speech motor control, the hierarchical architecture of FACTS relies on both the higher-level task (vocal tract constrictions) as well as lower-level articulatory state representations. Importantly, FACTS also computes sensory prediction errors as a part of its state feedback control mechanism, a well-established framework in the field of motor control. We explored potential adaptation mechanisms and found that adaptive behavior was present only when prediction errors updated the articulatory-to-task state transformation. In contrast, designs in which prediction errors updated forward sensory prediction models alone did not generate adaptation. Thus, FACTS demonstrated that 1) prediction errors can drive adaptation through task-level updates, and 2) adaptation is likely driven by updates to task-level control rather than (only) to forward predictive models. Additionally, simulating adaptation with FACTS generated a number of important hypotheses regarding previously reported phenomena such as identifying the source(s) of incomplete adaptation and driving factor(s) for changes in the second formant frequency during adaptation to the first formant perturbation. 
The proposed model design paves the way for a hierarchical state feedback control framework to be examined in the context of sensorimotor adaptation in both speech and non-speech effector systems.}, } @article {pmid37499137, year = {2023}, author = {Illner, V and Tykalova, T and Skrabal, D and Klempir, J and Rusz, J}, title = {Automated Vowel Articulation Analysis in Connected Speech Among Progressive Neurological Diseases, Dysarthria Types, and Dysarthria Severities.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {8}, pages = {2600-2621}, doi = {10.1044/2023_JSLHR-22-00526}, pmid = {37499137}, issn = {1558-9102}, mesh = {Humans ; Dysarthria/etiology ; Speech/physiology ; *Cerebellar Ataxia ; *Parkinson Disease/complications ; Articulation Disorders ; Atrophy ; Speech Acoustics ; Speech Intelligibility ; }, abstract = {PURPOSE: Although articulatory impairment represents distinct speech characteristics in most neurological diseases affecting movement, methods allowing automated assessments of articulation deficits from the connected speech are scarce. This study aimed to design a fully automated method for analyzing dysarthria-related vowel articulation impairment and estimate its sensitivity in a broad range of neurological diseases and various types and severities of dysarthria.

METHOD: Unconstrained monologue and reading passages were acquired from 459 speakers, including 306 healthy controls and 153 neurological patients. The algorithm utilized a formant tracker in combination with a phoneme recognizer and subsequent signal processing analysis.

RESULTS: Articulatory undershoot of vowels was present in a broad spectrum of progressive neurodegenerative diseases, including Parkinson's disease, progressive supranuclear palsy, multiple-system atrophy, Huntington's disease, essential tremor, cerebellar ataxia, multiple sclerosis, and amyotrophic lateral sclerosis, as well as in related dysarthria subtypes including hypokinetic, hyperkinetic, ataxic, spastic, flaccid, and their mixed variants. Formant ratios showed a higher sensitivity to vowel deficits than vowel space area. First formants of corner vowels were significantly lower for multiple-system atrophy than for cerebellar ataxia. Second formants of vowels /a/ and /i/ were lower in ataxic compared to spastic dysarthria. Discriminant analysis showed a classification score of up to 41.0% for disease type, 39.3% for dysarthria type, and 49.2% for dysarthria severity. Algorithm accuracy reached an F-score of 0.77.

CONCLUSIONS: Distinctive vowel articulation alterations reflect the underlying pathophysiology in neurological diseases. Objective acoustic analysis of vowel articulation has the potential to provide a universal method to screen for motor speech disorders.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.23681529.}, } @article {pmid37496795, year = {2023}, author = {Mailhos, A and Egea-Caparrós, DA and Cabana, Á and Martínez-Sánchez, F}, title = {Voice pitch is negatively associated with sociosexual behavior in males but not in females.}, journal = {Frontiers in psychology}, volume = {14}, number = {}, pages = {1200065}, pmid = {37496795}, issn = {1664-1078}, abstract = {Acoustic cues play a major role in social interactions in many animal species. In addition to the semantic contents of human speech, voice attributes - e.g., voice pitch, formant position, formant dispersion, etc. - have been proposed to provide critical information for the assessment of potential rivals and mates. However, prior studies exploring the association of acoustic attributes with reproductive success, or some of its proxies, have produced mixed results. Here, we investigate whether the mean fundamental frequency (F0), formant position (Pf), and formant dispersion (Df) - dimorphic attributes of the human voice - are related to sociosexuality, as measured by the Revised Sociosexual Orientation Inventory (SOI-R) - a trait also known to exhibit sex differences - in a sample of native Spanish-speaking students (101 males, 147 females). Analyses showed a significant negative correlation between F0 and sociosexual behavior, and between Pf and sociosexual desire in males but not in females. These correlations remained significant after correcting for false discovery rate (FDR) and controlling for age, a potential confounding variable. Our results are consistent with a role of F0 and Pf serving as cues in the mating domain in males but not in females. Alternatively, the association of voice attributes and sociosexual orientation might stem from the parallel effect of male sex hormones both on the male brain and the anatomical structures involved in voice production.}, } @article {pmid37477268, year = {2023}, author = {González-Alvarez, J and Sos-Peña, R}, title = {Body Perception From Connected Speech: Speaker Height Discrimination from Natural Sentences and Sine-Wave Replicas with and without Pitch.}, journal = {Perceptual and motor skills}, volume = {130}, number = {4}, pages = {1353-1365}, doi = {10.1177/00315125231173581}, pmid = {37477268}, issn = {1558-688X}, mesh = {Humans ; Male ; Female ; Speech ; *Speech Perception ; *Voice ; Body Height ; Language ; Pitch Perception ; }, abstract = {In addition to language, the human voice carries information about the physical characteristics of speakers, including their body size (height and weight). The fundamental speaking frequency, perceived as voice pitch, and the formant frequencies, or resonators of the vocal tract, are the acoustic speech parameters that have been most intensely studied for perceiving a speaker's body size. In this study, we created sine-wave (SW) replicas of connected speech (sentences) uttered by 20 male and 20 female speakers, consisting of three time-varying sinusoidal waves matching the frequency pattern of the first three formants of each sentence. These stimuli only provide information about the formant frequencies of a speech signal. We also created a new experimental condition by adding a sinusoidal replica of the voice pitch of each sentence. 
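As context for these sine-wave stimuli, the following is a minimal synthesis sketch: each sinusoid follows a time-varying frequency track by integrating instantaneous frequency, with an optional fourth sinusoid for the F0 track. The sampling rate and track values are illustrative, not the study's materials:

```python
import numpy as np

def sine_wave_replica(formant_tracks, fs=16000, pitch_track=None):
    """Sum of sinusoids following per-sample frequency tracks (Hz).
    formant_tracks: array (n_samples, 3) for F1-F3; pitch_track: optional
    (n_samples,) F0 track added as a fourth sinusoid."""
    tracks = [formant_tracks[:, k] for k in range(formant_tracks.shape[1])]
    if pitch_track is not None:
        tracks.append(pitch_track)
    out = np.zeros(formant_tracks.shape[0])
    for f in tracks:
        phase = 2 * np.pi * np.cumsum(f) / fs  # integrate frequency to phase
        out += np.sin(phase)
    return out / len(tracks)

# Toy example: 0.5 s with static /a/-like formants and a flat 120 Hz pitch
n = 8000
tracks = np.tile([700.0, 1200.0, 2600.0], (n, 1))
replica = sine_wave_replica(tracks, fs=16000, pitch_track=np.full(n, 120.0))
```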
Results obtained from a binary discrimination task revealed that (a) our SW replicas provided sufficient information to judge the speakers' body height above chance level; (b) adding the sinusoidal replica of the voice pitch did not significantly increase accuracy; and (c) stimuli from female speakers were more informative for body height detection and allowed higher perceptual accuracy, owing to a stronger correlation between formant frequencies and actual body height than for stimuli from male speakers.}, } @article {pmid37467104, year = {2023}, author = {Vilanova, ID and Almeida, SB and de Araújo, VS and Santos, RS and Schroder, AGD and Zeigelboim, BS and Corrêa, CC and Taveira, KVM and de Araujo, CM}, title = {Impact of orthognathic surgery on voice and speech: a systematic review and meta-analysis.}, journal = {European journal of orthodontics}, volume = {45}, number = {6}, pages = {747-763}, doi = {10.1093/ejo/cjad025}, pmid = {37467104}, issn = {1460-2210}, mesh = {Adult ; Humans ; *Orthognathic Surgery ; Speech ; *Orthognathic Surgical Procedures ; }, abstract = {BACKGROUND: Orthognathic surgical procedures, whether in one or both jaws, can affect structures involved in the articulation and resonance of voice and speech.

OBJECTIVE: To evaluate the impact of orthognathic surgery on voice and speech performance in individuals with skeletal dentofacial disharmony.

SEARCH METHODS: Word combinations and truncations were adapted for the following electronic databases: EMBASE, PubMed/Medline, Scopus, Web of Science, Cochrane Library, and Latin American and Caribbean Literature in Health Sciences (LILACS), as well as grey literature.

SELECTION CRITERIA: The research included studies on nonsyndromic adults with skeletal dentofacial disharmony undergoing orthognathic surgery. These studies assessed patients before and after surgery, or compared them with individuals with good facial harmony, using voice and speech parameters measured through validated protocols.

DATA COLLECTION AND ANALYSIS: Two independent reviewers performed all stages of the review. The Joanna Briggs Institute tool was used to assess risk of bias in the cohort studies, and ROBINS-I was used for nonrandomized clinical trials. The authors also performed a random-effects meta-analysis.
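For readers unfamiliar with the random-effects approach, a compact DerSimonian-Laird estimator is sketched below; it is a standard method for this kind of pooling, though not necessarily the exact estimator used in this review, and the effect sizes are toy values:

```python
import numpy as np

def dersimonian_laird(effects, variances):
    """Random-effects pooled estimate via the DerSimonian-Laird method."""
    y = np.asarray(effects, float)
    v = np.asarray(variances, float)
    w = 1.0 / v                              # fixed-effect weights
    y_fixed = np.sum(w * y) / np.sum(w)
    q = np.sum(w * (y - y_fixed) ** 2)       # Cochran's Q
    df = len(y) - 1
    c = np.sum(w) - np.sum(w ** 2) / np.sum(w)
    tau2 = max(0.0, (q - df) / c)            # between-study variance
    w_star = 1.0 / (v + tau2)                # random-effects weights
    pooled = np.sum(w_star * y) / np.sum(w_star)
    se = np.sqrt(1.0 / np.sum(w_star))
    return pooled, se, tau2

# Toy example: standardized mean differences and their variances
print(dersimonian_laird([0.10, -0.05, 0.20], [0.02, 0.03, 0.025]))
```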

RESULTS: A total of 1163 articles were retrieved after the last search, of which 23 were read in full. Of these, four were excluded, leaving 19 articles for quantitative synthesis. When comparing the pre- and postoperative periods, orthognathic surgery did not affect vowel production in terms of fundamental frequency, formants, or jitter and shimmer perturbation measures. According to the articles, the main articulatory errors associated with skeletal dentofacial disharmonies prior to surgery were distortions of fricative sounds, mainly /s/ and /z/.

CONCLUSIONS: Orthognathic surgery may have little or no impact on vocal characteristics during vowel production. However, due to the confounding factors involved, estimates are inconclusive. The most prevalent articulatory disorders in the preoperative period were distortions of the fricative phonemes /s/ and /z/. Further studies must therefore be carried out to ensure greater robustness of these findings.

REGISTRATION: PROSPERO (CRD42022291113).}, } @article {pmid37436271, year = {2023}, author = {Stoehr, A and Souganidis, C and Thomas, TB and Jacobsen, J and Martin, CD}, title = {Voice onset time and vowel formant measures in online testing and laboratory-based testing with(out) surgical face masks.}, journal = {The Journal of the Acoustical Society of America}, volume = {154}, number = {1}, pages = {152-166}, doi = {10.1121/10.0020064}, pmid = {37436271}, issn = {1520-8524}, mesh = {Humans ; Masks ; Pandemics ; Speech Acoustics ; *COVID-19 ; *Voice ; Phonetics ; }, abstract = {Since the COVID-19 pandemic started, conducting experiments online is increasingly common, and face masks are often used in everyday life. It remains unclear whether phonetic detail in speech production is captured adequately when speech is recorded in internet-based experiments or in experiments conducted with face masks. We tested 55 Spanish-Basque-English trilinguals in picture naming tasks in three conditions: online, laboratory-based with surgical face masks, and laboratory-based without face masks (control). We measured plosive voice onset time (VOT) in each language, the formants and duration of English vowels /iː/ and /ɪ/, and the Spanish/Basque vowel space. Across conditions, there were differences between English and Spanish/Basque VOT, and in the formants and duration of English /iː/ versus /ɪ/; between conditions, only small differences emerged. Relative to the control condition, the Spanish/Basque vowel space was larger in online testing and smaller in the face mask condition. We conclude that testing online or with face masks is suitable for investigating phonetic detail in within-participant designs, although the precise measurements may differ from those in traditional laboratory-based research.}, } @article {pmid37433805, year = {2023}, author = {Kries, J and De Clercq, P and Lemmens, R and Francart, T and Vandermosten, M}, title = {Acoustic and phonemic processing are impaired in individuals with aphasia.}, journal = {Scientific reports}, volume = {13}, number = {1}, pages = {11208}, pmid = {37433805}, issn = {2045-2322}, mesh = {Humans ; *Aphasia/etiology ; *Language Disorders ; Acoustics ; Cognition ; Individuality ; }, abstract = {Acoustic and phonemic processing are understudied in aphasia, a language disorder that can affect different levels and modalities of language processing. For successful speech comprehension, processing of the speech envelope, which reflects amplitude changes over time (e.g., rise times), is necessary. Moreover, to identify speech sounds (i.e., phonemes), efficient processing of spectro-temporal changes as reflected in formant transitions is essential. Given the underrepresentation of aphasia studies on these aspects, we tested rise time processing and phoneme identification in 29 individuals with post-stroke aphasia and 23 healthy age-matched controls. We found significantly lower performance in the aphasia group than in the control group on both tasks, even when controlling for individual differences in hearing levels and cognitive functioning. Further, by conducting an individual deviance analysis, we found a low-level acoustic or phonemic processing impairment in 76% of individuals with aphasia. Additionally, we investigated whether this impairment would propagate to higher-level language processing and found that rise time processing predicts phonological processing performance in individuals with aphasia.
These findings show that it is important to develop diagnostic and treatment tools that target low-level language processing mechanisms.}, } @article {pmid37424066, year = {2024}, author = {Maes, P and Weyland, M and Kissine, M}, title = {Structure and acoustics of the speech of verbal autistic preschoolers.}, journal = {Journal of child language}, volume = {51}, number = {3}, pages = {509-525}, doi = {10.1017/S0305000923000417}, pmid = {37424066}, issn = {1469-7602}, support = {//Fondation Roger de Spoelberch/ ; //Fondation Francqui - Stichting/ ; //Marguerite-Marie Delacroix foundation/ ; }, mesh = {Humans ; Child, Preschool ; Male ; Female ; *Speech Acoustics ; Phonetics ; Child Language ; Autistic Disorder/psychology ; Speech ; Speech Production Measurement ; }, abstract = {In this study, we report an extensive investigation of the structural language and acoustical specificities of the spontaneous speech of ten three- to five-year-old verbal autistic children. The autistic children were compared to a group of ten typically developing children matched pairwise on chronological age, nonverbal IQ and socioeconomic status, and groupwise on verbal IQ and gender on various measures of structural language (phonetic inventory, lexical diversity and morpho-syntactic complexity) and a series of acoustical measures of speech (mean and range fundamental frequency, a formant dispersion index, syllable duration, jitter and shimmer). Results showed that, overall, the structure and acoustics of the verbal autistic children's speech were highly similar to those of the TD children. Few remaining atypicalities in the speech of autistic children lay in a restricted use of different vocabulary items, a somewhat diminished morpho-syntactic complexity, and a slightly exaggerated syllable duration.}, } @article {pmid37417627, year = {2023}, author = {Park, EJ and Yoo, SD}, title = {Correlation between the parameters of quadrilateral vowel and dysphonia severity in patients with traumatic brain injury.}, journal = {Medicine}, volume = {102}, number = {27}, pages = {e33030}, pmid = {37417627}, issn = {1536-5964}, mesh = {Humans ; *Dysphonia/diagnosis/etiology ; Retrospective Studies ; Dysarthria ; Quality of Life ; Acoustics ; }, abstract = {Dysarthria and dysphonia are common in patients with traumatic brain injury (TBI). Multiple factors may contribute to TBI-induced dysarthria, including poor vocalization, articulation, respiration, and/or resonance. Many patients suffer from dysarthria that persists after the onset of TBI, with negative effects on their quality of life. This study aimed to investigate the relationship between vowel quadrilateral parameters and the Dysphonia Severity Index (DSI), which objectively reflects vocal function. We retrospectively enrolled TBI patients diagnosed using computed tomography. Participants had dysarthria and dysphonia and underwent acoustic analysis. Praat software was used to measure vowel space area (VSA), formant centralization ratio (FCR), and the second formant (F2) ratio. For the 4 corner vowels (/a/, /u/, /i/, and /ae/), the resonance frequencies of the vocal tract were measured and are shown as 2-dimensional coordinates for the formant parameters. Pearson correlation and multiple linear regression analyses were performed between the variables. VSA showed a significant positive correlation with DSI/a/ (R = 0.221) and DSI/i/ (R = 0.026). FCR showed a significant negative correlation with DSI/u/ and DSI/i/.
The F2 ratio showed a significant positive correlation with DSI/u/ and DSI/ae/. In the multiple linear regression analysis, VSA was found to be a significant predictor of DSI/a/ (β = 0.221, P = .030, R² = 0.139). F2 ratio (β = 0.275, P = .015) and FCR (β = -0.218, P = .029) were significant predictors of DSI/u/ (R² = 0.203). FCR was a significant predictor of DSI/i/ (β = -0.260, P = .010, R² = 0.158). F2 ratio was a significant predictor of DSI/ae/ (β = 0.254, P = .013, R² = 0.154). Vowel quadrilateral parameters, such as VSA, FCR, and F2 ratio, may be associated with dysphonia severity in TBI patients.}, } @article {pmid37416548, year = {2023}, author = {Persson, A and Jaeger, TF}, title = {Evaluating normalization accounts against the dense vowel space of Central Swedish.}, journal = {Frontiers in psychology}, volume = {14}, number = {}, pages = {1165742}, pmid = {37416548}, issn = {1664-1078}, abstract = {Talkers vary in the phonetic realization of their vowels. One influential hypothesis holds that listeners overcome this inter-talker variability through pre-linguistic auditory mechanisms that normalize the acoustic or phonetic cues that form the input to speech recognition. Dozens of competing normalization accounts exist-including both accounts specific to vowel perception and general purpose accounts that can be applied to any type of cue. We add to the cross-linguistic literature on this matter by comparing normalization accounts against a new phonetically annotated vowel database of Swedish, a language with a particularly dense vowel inventory of 21 vowels differing in quality and quantity. We evaluate normalization accounts on how they differ in predicted consequences for perception. The results indicate that the best performing accounts either center or standardize formants by talker. The study also suggests that general purpose accounts perform as well as vowel-specific accounts, and that vowel normalization operates in both temporal and spectral domains.}, } @article {pmid37413966, year = {2023}, author = {Steinschneider, M}, title = {Toward an understanding of vowel encoding in the human auditory cortex.}, journal = {Neuron}, volume = {111}, number = {13}, pages = {1995-1997}, doi = {10.1016/j.neuron.2023.06.004}, pmid = {37413966}, issn = {1097-4199}, mesh = {Humans ; *Auditory Cortex/physiology ; Phonetics ; *Speech Perception/physiology ; }, abstract = {In this issue of Neuron, Oganian et al.[1] performed intracranial recordings in the auditory cortex of human subjects to clarify how vowels are encoded by the brain. Formant-based tuning curves demonstrated the organization of vowel encoding. The need for population codes and demonstration of speaker normalization were emphasized.}, } @article {pmid37404579, year = {2023}, author = {Hong, Y and Chen, S and Zhou, F and Chan, A and Tang, T}, title = {Phonetic entrainment in L2 human-robot interaction: an investigation of children with and without autism spectrum disorder.}, journal = {Frontiers in psychology}, volume = {14}, number = {}, pages = {1128976}, pmid = {37404579}, issn = {1664-1078}, abstract = {Phonetic entrainment is a phenomenon in which people adjust their phonetic features to approach those of their conversation partner. Individuals with Autism Spectrum Disorder (ASD) have been reported to show some deficits in entrainment during their interactions with human interlocutors, though deficits in terms of significant differences from typically developing (TD) controls were not always registered.
One reason for these inconsistencies in detecting deficits in autistic individuals is that the conversation partner's speech could hardly be controlled, and both the participants and the partners might have been adjusting their phonetic features. The variability in the conversation partners' speech and the various social traits they exhibited might make the phonetic entrainment (if any) of the participants less detectable. In this study, we attempted to reduce the variability of the interlocutors by employing a social robot and having it perform a goal-directed conversation task with children with and without ASD. Fourteen autistic children and 12 TD children participated in the current study in their second language, English. Results showed that autistic children exhibited vowel formant and mean fundamental frequency (f0) entrainment comparable to that of their TD peers, but they did not entrain their f0 range as the TD group did. These findings suggest that autistic children were capable of exhibiting phonetic entrainment behaviors similar to TD children in vowel formants and f0, particularly in a less complex situation where the speech features and social traits of the interlocutor were controlled. Furthermore, the utilization of a social robot may have increased the interest of these children in phonetic entrainment. On the other hand, entrainment of f0 range was more challenging for these autistic children even in a more controlled situation. This study demonstrates the viability and potential of using human-robot interactions as a novel method to evaluate abilities and deficits in phonetic entrainment in autistic children.}, } @article {pmid37401990, year = {2023}, author = {Terranova, F and Baciadonna, L and Maccarone, C and Isaja, V and Gamba, M and Favaro, L}, title = {Penguins perceive variations of source- and filter-related vocal parameters of species-specific vocalisations.}, journal = {Animal cognition}, volume = {26}, number = {5}, pages = {1613-1622}, pmid = {37401990}, issn = {1435-9456}, mesh = {Animals ; *Spheniscidae ; Vocalization, Animal ; Species Specificity ; Acoustics ; Sound ; }, abstract = {Animal vocalisations encode a wide range of biological information about the age, sex, body size, and social status of the emitter. Moreover, vocalisations play a significant role in signalling the identity of the emitter to conspecifics. Recent studies have shown that, in the African penguin (Spheniscus demersus), acoustic cues to individual identity are encoded in the fundamental frequency (F0) and resonance frequencies (formants) of the vocal tract. However, although penguins are known to produce vocalisations where F0 and formants vary among individuals, it remains to be tested whether the receivers can perceive and use such information in the individual recognition process. In this study, using the Habituation-Dishabituation (HD) paradigm, we tested the hypothesis that penguins perceive and respond to a shift of ± 20% (corresponding to the natural inter-individual variation observed in ex-situ colonies) of F0 and formant dispersion (ΔF) of species-specific calls. We found that penguins were more likely to look rapidly and for longer at the source of the sound when F0 and formants of the calls were manipulated, indicating that they could perceive variations of these parameters in the vocal signals.
Our findings provide the first experimental evidence that, in the African penguin, listeners can perceive changes in F0 and formants, which can be used by the receiver as potential cues for the individual discrimination of the emitter.}, } @article {pmid37391267, year = {2024}, author = {Panneton, R and Cristia, A and Taylor, C and Moon, C}, title = {Positive Valence Contributes to Hyperarticulation in Maternal Speech to Infants and Puppies.}, journal = {Journal of child language}, volume = {51}, number = {5}, pages = {1230-1240}, doi = {10.1017/S0305000923000296}, pmid = {37391267}, issn = {1469-7602}, mesh = {Humans ; Female ; Infant ; *Mother-Child Relations ; *Emotions ; Animals ; Dogs ; Speech ; Maternal Behavior/psychology ; Male ; Mothers/psychology ; Adult ; Phonetics ; }, abstract = {Infant-directed speech often has hyperarticulated features, such as point vowels whose formants are further apart than in adult-directed speech. This increased "vowel space" may reflect the caretaker's effort to speak more clearly to infants, thus benefiting language processing. However, hyperarticulation may also result from more positive valence (e.g., speaking with positive vocal emotion) often found in mothers' speech to infants. This study was designed to replicate others who have found hyperarticulation in maternal speech to their 6-month-olds, but also to examine their speech to a non-human infant (i.e., a puppy). We rated both kinds of maternal speech for their emotional valence and recorded mothers' speech to a human adult. We found that mothers produced more positively valenced utterances and some hyperarticulation in both their infant- and puppy-directed speech, compared to their adult-directed speech. This finding promotes looking at maternal speech from a multi-faceted perspective that includes emotional state.}, } @article {pmid37384576, year = {2023}, author = {Vogt, C and Floegel, M and Kasper, J and Gispert-Sánchez, S and Kell, CA}, title = {Oxytocinergic modulation of speech production-a double-blind placebo-controlled fMRI study.}, journal = {Social cognitive and affective neuroscience}, volume = {18}, number = {1}, pages = {}, pmid = {37384576}, issn = {1749-5024}, mesh = {Humans ; Male ; *Speech ; *Oxytocin/pharmacology ; Magnetic Resonance Imaging ; Receptors, Oxytocin/genetics ; Language ; Double-Blind Method ; Administration, Intranasal ; Brain/physiology ; }, abstract = {Many socio-affective behaviors, such as speech, are modulated by oxytocin. While oxytocin modulates speech perception, it is not known whether it also affects speech production. Here, we investigated effects of oxytocin administration and interactions with the functional rs53576 oxytocin receptor (OXTR) polymorphism on produced speech and its underlying brain activity. During functional magnetic resonance imaging, 52 healthy male participants read sentences out loud with either neutral or happy intonation, a covert reading condition served as a common baseline. Participants were studied once under the influence of intranasal oxytocin and in another session under placebo. Oxytocin administration increased the second formant of produced vowels. This acoustic feature has previously been associated with speech valence; however, the acoustic differences were not perceptually distinguishable in our experimental setting. 
When preparing to speak, oxytocin enhanced brain activity in sensorimotor cortices and regions of both dorsal and right ventral speech processing streams, as well as subcortical and cortical limbic and executive control regions. In some of these regions, the rs53576 OXTR polymorphism modulated oxytocin administration-related brain activity. Oxytocin also gated cortical-basal ganglia circuits involved in the generation of happy prosody. Our findings suggest that several neural processes underlying speech production are modulated by oxytocin, including control of not only affective intonation but also sensorimotor aspects during emotionally neutral speech.}, } @article {pmid37344246, year = {2023}, author = {Vasquez-Serrano, P and Reyes-Moreno, J and Guido, RC and Sepúlveda-Sepúlveda, A}, title = {MFCC Parameters of the Speech Signal: An Alternative to Formant-Based Instantaneous Vocal Tract Length Estimation.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.05.012}, pmid = {37344246}, issn = {1873-4588}, abstract = {On the one hand, the relationship between formant frequencies and vocal tract length (VTL) has been intensively studied over the years. On the other hand, the connection involving mel-frequency cepstral coefficients (MFCCs), which concisely codify the overall shape of a speaker's spectral envelope with just a few cepstral coefficients, and VTL has only been modestly analyzed, being worth of further investigation. Thus, based on different statistical models, this article explores the advantages and disadvantages of the latter approach, which is relatively novel, in contrast to the former which arises from more traditional studies. Additionally, VTL is assumed to be a static and inherent characteristic of speakers, that is, a single length parameter is frequently estimated per speaker. By contrast, in this paper we consider VTL estimation from a dynamic perspective using modern real-time Magnetic Resonance Imaging (rtMRI) to measure VTL in parallel with audio signals. To support the experiments, data obtained from USC-TIMIT magnetic resonance videos were used, allowing for the 2D real-time analysis of articulators in motion. As a result, we observed that the performance of MFCCs is higher in the case of speaker-dependent modeling; however, in the case of cross-speaker modeling, which uses different speakers' data for training and evaluation, performance is not significantly different from that obtained with formants.
In addition, the MFCC-based estimation is robust, with acceptable computational time complexity, consistent with the traditional approach.}, } @article {pmid37307398, year = {2023}, author = {Cox, C and Dideriksen, C and Keren-Portnoy, T and Roepstorff, A and Christiansen, MH and Fusaroli, R}, title = {Infant-directed speech does not always involve exaggerated vowel distinctions: Evidence from Danish.}, journal = {Child development}, volume = {94}, number = {6}, pages = {1672-1696}, doi = {10.1111/cdev.13950}, pmid = {37307398}, issn = {1467-8624}, support = {DFF-7013-00074//Danmarks Frie Forskningsfond/ ; //Interacting Minds Centre/ ; }, mesh = {Adult ; Infant ; Humans ; Female ; Child, Preschool ; Male ; Child ; *Speech ; Language ; Language Development ; Child Language ; *Speech Perception ; Denmark ; Phonetics ; Speech Acoustics ; }, abstract = {This study compared the acoustic properties of 26 (100% female, 100% monolingual) Danish caregivers' spontaneous speech addressed to their 11- to 24-month-old infants (infant-directed speech, IDS) and an adult experimenter (adult-directed speech, ADS). The data were collected between 2016 and 2018 in Aarhus, Denmark. Prosodic properties of Danish IDS conformed to cross-linguistic patterns, with a higher pitch, greater pitch variability, and slower articulation rate than ADS. However, an acoustic analysis of vocalic properties revealed that Danish IDS had a reduced or similar vowel space, higher within-vowel variability, raised formants, and lower degree of vowel discriminability compared to ADS. None of the measures, except articulation rate, showed age-related differences. These results push for future research to conduct theory-driven comparisons across languages with distinct phonological systems.}, } @article {pmid37305920, year = {2023}, author = {, }, title = {Editor's note: Chimpanzee vowel-like sounds and voice quality suggest formant space expansion through the hominoid lineage.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {378}, number = {1882}, pages = {20230201}, pmid = {37305920}, issn = {1471-2970}, } @article {pmid37303890, year = {2023}, author = {Baron, A and Harwood, V and Kleinman, D and Campanelli, L and Molski, J and Landi, N and Irwin, J}, title = {Where on the face do we look during phonemic restoration: An eye-tracking study.}, journal = {Frontiers in psychology}, volume = {14}, number = {}, pages = {1005186}, pmid = {37303890}, issn = {1664-1078}, abstract = {Face to face communication typically involves audio and visual components to the speech signal. To examine the effect of task demands on gaze patterns in response to a speaking face, adults participated in two eye-tracking experiments with an audiovisual (articulatory information from the mouth was visible) and a pixelated condition (articulatory information was not visible). Further, task demands were manipulated by having listeners respond in a passive (no response) or an active (button press response) context. The active experiment required participants to discriminate between speech stimuli and was designed to mimic environmental situations which require one to use visual information to disambiguate the speaker's message, simulating different listening conditions in real-world settings. Stimuli included a clear exemplar of the syllable /ba/ and a second exemplar in which the formant transitions of the initial consonant were reduced, creating an /a/-like consonant.
Consistent with our hypothesis, results revealed that fixations to the mouth were greatest in the audiovisual active experiment and that visual articulatory information led to a phonemic restoration effect for the /a/ speech token. In the pixelated condition, participants fixated on the eyes, and discrimination of the deviant token within the active experiment was significantly greater than in the audiovisual condition. These results suggest that when required to disambiguate changes in speech, adults may look to the mouth for additional cues to support processing when such information is available.}, } @article {pmid37302909, year = {2023}, author = {Ikuma, T and McWhorter, AJ and Oral, E and Kunduk, M}, title = {Formant-Aware Spectral Analysis of Sustained Vowels of Pathological Breathy Voice.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.05.002}, pmid = {37302909}, issn = {1873-4588}, abstract = {OBJECTIVES: This paper reports the effectiveness of formant-aware spectral parameters in predicting the perceptual breathiness rating. A breathy voice has a steeper spectral slope and higher turbulent noise than a normal voice. Measuring spectral parameters of acoustic signals over lower formant regions is a known approach to capture the properties related to breathiness. This study examines this approach by testing the contemporary spectral parameters and algorithms within the framework, alternate frequency band designs, and vowel effects.

METHODS: Sustained vowel recordings (/a/, /i/, and /u/) of speakers with voice disorders in the German Saarbruecken Voice Database were considered (n: 367). Recordings with signal irregularities (such as subharmonics) or with roughness perception were excluded from the study. Four speech-language pathologists perceptually rated the recordings for breathiness on a 100-point scale, and their averages were used in the analysis. The acoustic spectra were segmented into four frequency bands according to the vowel formant structures. Four spectral parameters (intraband harmonics-to-noise ratio, HNR; interband harmonics ratio, HHR; interband noise ratio, NNR; and interband glottal-to-noise energy ratio, GNE) were evaluated in each band to predict the perceptual breathiness rating. Four HNR algorithms were tested.
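A simplified illustration of an intraband harmonics-to-noise measure follows, assuming a known F0 and treating spectral energy near harmonic frequencies as signal and the rest of the band as noise; the paper's actual HNR algorithms are more elaborate, and the tolerance and band edges here are arbitrary choices:

```python
import numpy as np

def band_hnr(x, fs, f0, band, tol_hz=15.0):
    """Crude intraband harmonics-to-noise ratio (dB): energy in bins near
    multiples of f0 vs. energy between harmonics, within band (lo, hi) Hz."""
    spec = np.abs(np.fft.rfft(x * np.hanning(len(x)))) ** 2
    freqs = np.fft.rfftfreq(len(x), 1.0 / fs)
    in_band = (freqs >= band[0]) & (freqs <= band[1])
    dist = np.abs(freqs - f0 * np.round(freqs / f0))  # distance to harmonic
    harm = in_band & (dist <= tol_hz)
    noise = in_band & (dist > tol_hz)
    return 10 * np.log10(spec[harm].sum() / spec[noise].sum())

# Toy test: 1 s harmonic source (F0 = 100 Hz) plus weak noise, F1-F2 band
fs = 16000
t = np.arange(fs) / fs
x = sum(np.sin(2 * np.pi * 100 * k * t) / k for k in range(1, 30))
x += 0.05 * np.random.randn(len(t))
print(band_hnr(x, fs, f0=100.0, band=(50.0, 2500.0)))
```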

RESULTS: Multiple linear regression models of spectral parameters, led by the HNRs, were shown to explain up to 85% of the variance in perceptual breathiness ratings. This performance exceeded that of the acoustic breathiness index (82%). Individually, the HNR over the first two formants best explained the variance in breathiness (78%), exceeding the smoothed cepstral peak prominence (74%). The performance of HNR was highly algorithm-dependent (10% spread). Some vowel effects were observed in the perceptual rating (higher for /u/), predictability (5% lower for /u/), and model parameter selections.
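The regression step can be sketched as follows, with synthetic stand-ins for the per-recording spectral parameters and perceptual ratings (the real study used the band-wise measures described above):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Synthetic placeholders: 367 recordings, four band-wise spectral parameters
# (e.g., HNR, HHR, NNR, GNE) and mean breathiness ratings on a 100-point scale.
rng = np.random.default_rng(0)
X = rng.normal(size=(367, 4))
ratings = 50 - 8 * X[:, 0] + rng.normal(scale=5, size=367)

model = LinearRegression().fit(X, ratings)
r2 = model.score(X, ratings)  # variance explained, cf. up to ~0.85 in the paper
print(f"R^2 = {r2:.2f}, coefficients = {model.coef_.round(2)}")
```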

CONCLUSIONS: Strong per-vowel breathiness acoustic models were found by segmenting the spectrum to isolate the portion most affected by breathiness.}, } @article {pmid37260602, year = {2023}, author = {Ashokumar, M and Guichet, C and Schwartz, JL and Ito, T}, title = {Correlation between the effect of orofacial somatosensory inputs in speech perception and speech production performance.}, journal = {Auditory perception & cognition}, volume = {6}, number = {1-2}, pages = {97-107}, pmid = {37260602}, issn = {2574-2450}, support = {R01 DC017439/DC/NIDCD NIH HHS/United States ; }, abstract = {INTRODUCTION: Orofacial somatosensory inputs modify the perception of speech sounds. Such auditory-somatosensory integration likely develops alongside speech production acquisition. We examined whether the somatosensory effect in speech perception varies depending on individual characteristics of speech production.

METHODS: The somatosensory effect in speech perception was assessed as the change in the category boundary between /e/ and /ø/ in a vowel identification test when somatosensory stimulation, a rearward facial skin deformation corresponding to the articulatory movement for /e/, was applied together with the auditory input. Speech production performance was quantified by the acoustic distances between the average first, second, and third formants of /e/ and /ø/ utterances recorded in a separate test.
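A minimal sketch of the production measure, assuming per-utterance formant measurements for each vowel (the formant values below are hypothetical):

```python
import numpy as np

def formant_distances(e_formants, oe_formants):
    """Per-formant acoustic distances (Hz) between average /e/ and /ø/
    productions; rows are utterances, columns are F1-F3."""
    e_mean = np.mean(np.asarray(e_formants, float), axis=0)
    oe_mean = np.mean(np.asarray(oe_formants, float), axis=0)
    return np.abs(e_mean - oe_mean)  # [|dF1|, |dF2|, |dF3|]

# Hypothetical utterance-level formants (F1, F2, F3) in Hz
e  = [(390, 2100, 2700), (405, 2150, 2760)]
oe = [(395, 1550, 2380), (410, 1600, 2450)]
print(formant_distances(e, oe))  # the F2 distance should dominate
```

Each participant's boundary shift could then be correlated against these per-formant distances across speakers, which is the analysis reported in the results.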

RESULTS: The category boundary between /e/ and /ø/ was significantly shifted towards /ø/ due to the somatosensory stimulation, which is consistent with previous research. The amplitude of the category boundary shift was significantly correlated with the acoustic distance between the mean second (and, marginally, third) formants of /e/ and /ø/ productions, with no correlation with the first formant distance.

DISCUSSION: Greater acoustic distances can be related to larger contrasts between the articulatory targets of vowels in speech production. These results suggest that the somatosensory effect in speech perception can be linked to speech production performance.}, } @article {pmid37227411, year = {2023}, author = {Saba, JN and Ali, H and Hansen, JHL}, title = {The effects of estimation accuracy, estimation approach, and number of selected channels using formant-priority channel selection for an "n-of-m" sound processing strategy for cochlear implants.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {5}, pages = {3100}, pmid = {37227411}, issn = {1520-8524}, support = {R01 DC016839/DC/NIDCD NIH HHS/United States ; }, mesh = {*Cochlear Implants ; *Speech Perception ; *Cochlear Implantation ; Sound ; Noise ; }, abstract = {Previously, selection of l channels was prioritized according to formant frequency locations in an l-of-n-of-m-based signal processing strategy to provide important voicing information independent of listening environments for cochlear implant (CI) users. In this study, ideal, or ground truth, formants were incorporated into the selection stage to determine the effect of accuracy on (1) subjective speech intelligibility, (2) objective channel selection patterns, and (3) objective stimulation patterns (current). An average +11% improvement (p < 0.05) was observed across six CI users in quiet, but not for noise or reverberation conditions. Analogous increases in channel selection and current for the upper range of F1, and a decrease across mid-frequencies with higher corresponding current, were both observed at the expense of noise-dominant channels. Objective channel selection patterns were analyzed a second time to determine the effects of estimation approach and number of selected channels (n). A significant effect of estimation approach was only observed in the noise and reverberation condition with minor differences in channel selection and significantly decreased stimulated current. Results suggest that estimation method, accuracy, and number of channels in the proposed strategy using ideal formants may improve intelligibility when the corresponding stimulated current of formant channels is not masked by noise-dominant channels.}, } @article {pmid37224720, year = {2023}, author = {Carney, LH and Cameron, DA and Kinast, KB and Feld, CE and Schwarz, DM and Leong, UC and McDonough, JM}, title = {Effects of sensorineural hearing loss on formant-frequency discrimination: Measurements and models.}, journal = {Hearing research}, volume = {435}, number = {}, pages = {108788}, pmid = {37224720}, issn = {1878-5891}, support = {R01 DC001641/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Speech Perception/physiology ; *Hearing Loss, Sensorineural/diagnosis ; Mesencephalon ; *Inferior Colliculi/physiology ; Cochlear Nerve ; Phonetics ; }, abstract = {This study concerns the effect of hearing loss on discrimination of formant frequencies in vowels. In the response of the healthy ear to a harmonic sound, auditory-nerve (AN) rate functions fluctuate at the fundamental frequency, F0. Responses of inner-hair-cells (IHCs) tuned near spectral peaks are captured (or dominated) by a single harmonic, resulting in lower fluctuation depths than responses of IHCs tuned between spectral peaks. Therefore, the depth of neural fluctuations (NFs) varies along the tonotopic axis and encodes spectral peaks, including formant frequencies of vowels.
This NF code is robust across a wide range of sound levels and in background noise. The NF profile is converted into a rate-place representation in the auditory midbrain, wherein neurons are sensitive to low-frequency fluctuations. The NF code is vulnerable to sensorineural hearing loss (SNHL) because capture depends upon saturation of IHCs, and thus the interaction of cochlear gain with IHC transduction. In this study, formant-frequency discrimination limens (DLFFs) were estimated for listeners with normal hearing or mild to moderate SNHL. The F0 was fixed at 100 Hz, and formant peaks were either aligned with harmonic frequencies or placed between harmonics. Formant peak frequencies were 600 and 2000 Hz, in the range of first and second formants of several vowels. The difficulty of the task was varied by changing formant bandwidth to modulate the contrast in the NF profile. Results were compared to predictions from model auditory-nerve and inferior colliculus (IC) neurons, with listeners' audiograms used to individualize the AN model. Correlations between DLFFs, audiometric thresholds near the formant frequencies, age, and scores on the Quick speech-in-noise test are reported. SNHL had a strong effect on DLFF for the second formant frequency (F2), but relatively small effect on DLFF for the first formant (F1). The IC model appropriately predicted substantial threshold elevations for changes in F2 as a function of SNHL and little effect of SNHL on thresholds for changes in F1.}, } @article {pmid37214801, year = {2023}, author = {Rizzi, R and Bidelman, GM}, title = {Duplex perception reveals brainstem auditory representations are modulated by listeners' ongoing percept for speech.}, journal = {bioRxiv : the preprint server for biology}, volume = {}, number = {}, pages = {}, pmid = {37214801}, issn = {2692-8205}, support = {R01 DC016267/DC/NIDCD NIH HHS/United States ; }, abstract = {So-called duplex speech stimuli with perceptually ambiguous spectral cues to one ear and isolated low- vs. high-frequency third formant "chirp" to the opposite ear yield a coherent percept supporting their phonetic categorization. Critically, such dichotic sounds are only perceived categorically upon binaural integration. Here, we used frequency-following responses (FFRs), scalp-recorded potentials reflecting phase-locked subcortical activity, to investigate brainstem responses to fused speech percepts and to determine whether FFRs reflect binaurally integrated category-level representations. We recorded FFRs to diotic and dichotic stop-consonants (/da/, /ga/) that either did or did not require binaural fusion to properly label along with perceptually ambiguous sounds without clear phonetic identity. Behaviorally, listeners showed clear categorization of dichotic speech tokens confirming they were heard with a fused, phonetic percept. Neurally, we found FFRs were stronger for categorically perceived speech relative to category-ambiguous tokens but also differentiated phonetic categories for both diotically and dichotically presented speech sounds. Correlations between neural and behavioral data further showed FFR latency predicted the degree to which listeners labeled tokens as "da" vs. "ga". 
The presence of binaurally integrated, category-level information in FFRs suggests human brainstem processing reflects a surprisingly abstract level of the speech code typically circumscribed to much later cortical processing.}, } @article {pmid37212513, year = {2023}, author = {Cox, SR and Huang, T and Chen, WR and Ng, ML}, title = {An acoustic study of Cantonese alaryngeal speech in different speaking conditions.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {5}, pages = {2973}, pmid = {37212513}, issn = {1520-8524}, support = {R01 DC002717/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Speech, Alaryngeal/methods ; Speech, Esophageal ; Speech ; *Larynx, Artificial ; Acoustics ; Speech Intelligibility ; Speech Acoustics ; }, abstract = {Esophageal (ES) speech, tracheoesophageal (TE) speech, and the electrolarynx (EL) are common methods of communication following the removal of the larynx. Our recent study demonstrated that intelligibility may increase for Cantonese alaryngeal speakers using clear speech (CS) compared to their everyday "habitual speech" (HS), but the reasoning is still unclear [Hui, Cox, Huang, Chen, and Ng (2022). Folia Phoniatr. Logop. 74, 103-111]. The purpose of this study was to assess the acoustic characteristics of vowels and tones produced by Cantonese alaryngeal speakers using HS and CS. Thirty-one alaryngeal speakers (9 EL, 10 ES, and 12 TE speakers) read The North Wind and the Sun passage in HS and CS. Vowel formants, vowel space area (VSA), speaking rate, pitch, and intensity were examined, and their relationship to intelligibility were evaluated. Statistical models suggest that larger VSAs significantly improved intelligibility, but slower speaking rate did not. Vowel and tonal contrasts did not differ between HS and CS for all three groups, but the amount of information encoded in fundamental frequency and intensity differences between high and low tones positively correlated with intelligibility for TE and ES groups, respectively. Continued research is needed to understand the effects of different speaking conditions toward improving acoustic and perceptual characteristics of Cantonese alaryngeal speech.}, } @article {pmid37210244, year = {2023}, author = {Valls-Ontañón, A and Ferreiro, M and Moragues-Aguiló, B and Molins-Ballabriga, G and Julián-González, S and Sauca-Balart, A and Hernández-Alfaro, F}, title = {Impact of 3-dimensional anatomical changes secondary to orthognathic surgery on voice resonance and articulatory function: a prospective study.}, journal = {The British journal of oral & maxillofacial surgery}, volume = {61}, number = {5}, pages = {373-379}, doi = {10.1016/j.bjoms.2023.04.007}, pmid = {37210244}, issn = {1532-1940}, mesh = {Humans ; *Orthognathic Surgery ; Prospective Studies ; Facial Bones ; Speech ; Tongue ; Speech Acoustics ; }, abstract = {An evaluation was made of the impact of orthognathic surgery (OS) on speech, addressing in particular the effects of skeletal and airway changes on voice resonance characteristics and articulatory function. A prospective study was carried out involving 29 consecutive patients subjected to OS.
Preoperative and short- and long-term postoperative evaluations were made of anatomical changes (skeletal and airway measurements), speech evolution (assessed objectively by acoustic analysis: fundamental frequency, local jitter, local shimmer of each vowel, and formants F1 and F2 of vowel /a/), and articulatory function (use of compensatory musculature, point of articulation, and speech intelligibility). These were also assessed subjectively by means of a visual analogue scale. Articulatory function after OS showed immediate improvement and had further progressed at one year of follow-up. This improvement significantly correlated with the anatomical changes, and was also notably perceived by the patients. On the other hand, although a slight modification in vocal resonance was reported and seen to correlate with anatomical changes of the tongue, hyoid bone, and airway, it was not subjectively perceived by the patients. In conclusion, the results demonstrated that OS had beneficial effects on articulatory function and imperceptible subjective changes in a patient's voice. Patients subjected to OS, apart from benefitting from improved articulatory function, should not be afraid that they will not recognise their voice after treatment.}, } @article {pmid37205390, year = {2023}, author = {Shellikeri, S and Cho, S and Ash, S and Gonzalez-Recober, C and McMillan, CT and Elman, L and Quinn, C and Amado, DA and Baer, M and Irwin, DJ and Massimo, L and Olm, C and Liberman, M and Grossman, M and Nevler, N}, title = {Digital markers of motor speech impairments in natural speech of patients with ALS-FTD spectrum disorders.}, journal = {medRxiv : the preprint server for health sciences}, volume = {}, number = {}, pages = {}, doi = {10.1101/2023.04.29.23289308}, pmid = {37205390}, support = {K08 NS114106/NS/NINDS NIH HHS/United States ; }, abstract = {BACKGROUND AND OBJECTIVES: Patients with ALS-FTD spectrum disorders (ALS-FTSD) have mixed motor and cognitive impairments and require valid and quantitative assessment tools to support diagnosis and tracking of bulbar motor disease. This study aimed to validate a novel automated digital speech tool that analyzes vowel acoustics from natural, connected speech as a marker for impaired articulation due to bulbar motor disease in ALS-FTSD.

METHODS: We used an automatic algorithm called Forced Alignment Vowel Extraction (FAVE) to detect spoken vowels and extract vowel acoustics from 1-minute audio-recorded picture descriptions. Using automated acoustic analysis scripts, we derived two articulatory-acoustic measures: vowel space area (VSA, in Bark²), which represents tongue range of motion (size), and average second formant slope of vowel trajectories (F2 slope), which represents tongue movement speed. We compared vowel measures between ALS with and without clinically evident bulbar motor disease (ALS+bulbar vs. ALS-bulbar), behavioral variant frontotemporal dementia (bvFTD) without a motor syndrome, and healthy controls (HC). We correlated impaired vowel measures with bulbar disease severity, estimated by clinical bulbar scores and perceived listener effort, and with MRI cortical thickness of the orobuccal part of the primary motor cortex innervating the tongue (oralPMC). We also tested correlations with respiratory capacity and cognitive impairment.
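The two articulatory-acoustic measures can be approximated as below; the Traunmueller Hz-to-Bark formula and a least-squares slope are standard choices for this kind of analysis, though FAVE's internals and the authors' exact slope definition may differ, and the trajectory values are toy data:

```python
import numpy as np

def hz_to_bark(f):
    """Traunmueller's Hz-to-Bark conversion, common for Bark^2 vowel spaces."""
    f = np.asarray(f, float)
    return 26.81 * f / (1960.0 + f) - 0.53

def f2_slope(times_s, f2_hz):
    """Average F2 slope (Hz/s) of one vowel trajectory via least squares."""
    return np.polyfit(np.asarray(times_s), np.asarray(f2_hz), 1)[0]

# Toy trajectory: F2 rising over a 120 ms vowel
t = np.linspace(0.0, 0.12, 12)
print(f2_slope(t, 1400 + 2500 * t))   # ~2500 Hz/s
print(hz_to_bark([700, 1200, 2600]))  # corner-vowel formants in Bark
```

The Bark-converted (F1, F2) corner-vowel coordinates can then feed a polygon-area computation like the shoelace sketch shown earlier in this section.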

RESULTS: Participants were 45 ALS+bulbar (30 males, mean age = 61 ± 11), 22 ALS-bulbar (11 males, age = 62 ± 10), 22 bvFTD (13 males, age = 63 ± 7), and 34 HC (14 males, age = 69 ± 8). ALS+bulbar had smaller VSA and shallower average F2 slopes than ALS-bulbar (VSA: |d| = 0.86, p = 0.0088; F2 slope: |d| = 0.98, p = 0.0054), bvFTD (VSA: |d| = 0.67, p = 0.043; F2 slope: |d| = 1.4, p < 0.001), and HC (VSA: |d| = 0.73, p = 0.024; F2 slope: |d| = 1.0, p < 0.001). Vowel measures declined with worsening bulbar clinical scores (VSA: R = 0.33, p = 0.033; F2 slope: R = 0.25, p = 0.048), and smaller VSA was associated with greater listener effort (R = -0.43, p = 0.041). Shallower F2 slopes were related to cortical thinning in oralPMC (R = 0.50, p = 0.03). Neither vowel measure was associated with respiratory or cognitive test scores.

CONCLUSIONS: Vowel measures extracted with automatic processing from natural speech are sensitive to bulbar motor disease in ALS-FTD and are robust to cognitive impairment.}, } @article {pmid37203275, year = {2023}, author = {Easwar, V and Peng, ZE and Mak, V and Mikiel-Hunter, J}, title = {Differences between children and adults in the neural encoding of voice fundamental frequency in the presence of noise and reverberation.}, journal = {The European journal of neuroscience}, volume = {58}, number = {2}, pages = {2547-2562}, doi = {10.1111/ejn.16049}, pmid = {37203275}, issn = {1460-9568}, mesh = {Humans ; Adult ; Male ; Child ; Aged ; Adolescent ; *Speech Perception/physiology ; Noise ; Speech ; }, abstract = {Environmental noise and reverberation challenge speech understanding more significantly in children than in adults. However, the neural/sensory basis for the difference is poorly understood. We evaluated the impact of noise and reverberation on the neural processing of the fundamental frequency of voice (f0)-an important cue to tag or recognize a speaker. In a group of 39 6- to 15-year-old children and 26 adults with normal hearing, envelope following responses (EFRs) were elicited by a male-spoken /i/ in quiet, noise, reverberation, and both noise and reverberation. Due to increased resolvability of harmonics at lower than higher vowel formants that may affect susceptibility to noise and/or reverberation, the /i/ was modified to elicit two EFRs: one initiated by the low frequency first formant (F1) and the other initiated by mid to high frequency second and higher formants (F2+) with predominantly resolved and unresolved harmonics, respectively. F1 EFRs were more susceptible to noise whereas F2+ EFRs were more susceptible to reverberation. Reverberation resulted in greater attenuation of F1 EFRs in adults than children, and greater attenuation of F2+ EFRs in older than younger children. Reduced modulation depth caused by reverberation and noise explained changes in F2+ EFRs but was not the primary determinant for F1 EFRs. Experimental data paralleled modelled EFRs, especially for F1. Together, data suggest that noise or reverberation influences the robustness of f0 encoding depending on the resolvability of vowel harmonics and that maturation of processing temporal/envelope information of voice is delayed in reverberation, particularly for low frequency stimuli.}, } @article {pmid37173176, year = {2024}, author = {Wang, Y and Hattori, M and Masaki, K and Sumita, YI}, title = {Detailed speech evaluation including formant 3 analysis and voice visualization in maxillofacial rehabilitation: A clinical report.}, journal = {The Journal of prosthetic dentistry}, volume = {132}, number = {6}, pages = {1331.e1-1331.e7}, doi = {10.1016/j.prosdent.2023.02.022}, pmid = {37173176}, issn = {1097-6841}, mesh = {Humans ; Male ; Aged ; *Palatal Obturators ; Maxillary Sinus/diagnostic imaging ; Speech Disorders/rehabilitation/etiology ; Speech Production Measurement ; Speech Acoustics ; Voice Quality ; Maxilla ; }, abstract = {Objective speech evaluation such as analysis of formants 1 and 2 and nasality measurement have been used in maxillofacial rehabilitation for outcome assessment. However, in some patients, those evaluations are insufficient to assess a specific or unique problem. This report describes the use of a new speech evaluation including formant 3 analysis and voice visualization in a patient with a maxillofacial defect. 
The patient was a 67-year-old man who had a maxillary defect that opened to the maxillary sinus and who had an unnatural voice even when wearing an obturator. Nasality was low and the frequencies of formants 1 and 2 were normal even without the obturator. However, a low frequency of formant 3 and a shifted center of voice were observed. These results indicated that the unnatural voice was related to increased resonant volume in the pharynx rather than hypernasality. This case demonstrates that advanced speech analysis can be useful for detecting the cause of a speech disorder and planning maxillofacial rehabilitation.}, } @article {pmid37138997, year = {2023}, author = {Cavalcanti, JC and Eriksson, A and Barbosa, PA}, title = {On the speaker discriminatory power asymmetry regarding acoustic-phonetic parameters and the impact of speaking style.}, journal = {Frontiers in psychology}, volume = {14}, number = {}, pages = {1101187}, pmid = {37138997}, issn = {1664-1078}, abstract = {This study aimed to assess what we refer to as the speaker discriminatory power asymmetry and its forensic implications in comparisons performed in different speaking styles: spontaneous dialogues vs. interviews. We also addressed the impact of data sampling on the speaker's discriminatory performance concerning different acoustic-phonetic estimates. The participants were 20 male Brazilian Portuguese speakers from the same dialectal area. The speech material consisted of spontaneous telephone conversations between familiar individuals, and interviews conducted between each individual participant and the researcher. Nine acoustic-phonetic parameters were chosen for the comparisons, spanning from temporal and melodic to spectral acoustic-phonetic estimates. Ultimately, an analysis based on the combination of different parameters was also conducted. Two speaker discriminatory metrics were examined: the log-likelihood-ratio cost (Cllr) and Equal Error Rate (EER) values. A general speaker discriminatory trend was suggested when assessing the parameters individually. Parameters pertaining to the temporal acoustic-phonetic class depicted the weakest performance in terms of speaker contrasting power as evidenced by the relatively higher Cllr and EER values. Moreover, from the set of acoustic parameters assessed, spectral parameters, mainly high formant frequencies, i.e., F3 and F4, were the best performing in terms of speaker discrimination, depicting the lowest EER and Cllr scores. The results appear to suggest a speaker discriminatory power asymmetry concerning parameters from different acoustic-phonetic classes, in which temporal parameters tended to present a lower discriminatory power. The speaking style mismatch also seemed to considerably impact the speaker comparison task, by undermining the overall discriminatory performance. A statistical model based on the combination of different acoustic-phonetic estimates was found to perform best in this case.
Finally, data sampling has proven to be of crucial relevance for the reliability of discriminatory power assessment.}, } @article {pmid37129674, year = {2023}, author = {Zaltz, Y}, title = {The effect of stimulus type and testing method on talker discrimination of school-age children.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {5}, pages = {2611}, doi = {10.1121/10.0017999}, pmid = {37129674}, issn = {1520-8524}, mesh = {Adult ; Humans ; Child ; Adolescent ; Young Adult ; *Speech Perception ; Child Development ; Speech ; Linguistics ; Acoustics ; }, abstract = {Efficient talker discrimination (TD) improves speech understanding under multi-talker conditions. So far, TD of children has been assessed using various testing parameters, making it difficult to draw comparative conclusions. This study explored the effects of the stimulus type and variability on children's TD. Thirty-two children (7-10 years old) underwent eight TD assessments with fundamental frequency + formant changes using an adaptive procedure. Stimuli included consonant-vowel-consonant words or three-word sentences and were either fixed by run or by trial (changing throughout the run). Cognitive skills were also assessed. Thirty-one adults (18-35 years old) served as controls. The results showed (1) poorer TD for the fixed-by-trial than the fixed-by-run method, with both stimulus types for the adults but only with the words for the children; (2) poorer TD for the words than the sentences with the fixed-by-trial method only for the children; and (3) significant correlations between the children's age and TD. These results support a developmental trajectory in the use of perceptual anchoring for TD and in its reliance on comprehensive acoustic and linguistic information. The finding that the testing parameters may influence the top-down and bottom-up processing for TD should be considered when comparing data across studies or when planning new TD experiments.}, } @article {pmid37128454, year = {2022}, author = {Ghosh, S and Feng, Z and Bian, J and Butler, K and Prosperi, M}, title = {DR-VIDAL - Doubly Robust Variational Information-theoretic Deep Adversarial Learning for Counterfactual Prediction and Treatment Effect Estimation on Real World Data.}, journal = {AMIA ... Annual Symposium proceedings. AMIA Symposium}, volume = {2022}, number = {}, pages = {485-494}, pmid = {37128454}, issn = {1942-597X}, support = {R01 AI141810/AI/NIAID NIH HHS/United States ; R01 AI170187/AI/NIAID NIH HHS/United States ; R01 AG076234/AG/NIA NIH HHS/United States ; R21 CA245858/CA/NCI NIH HHS/United States ; R01 AI145552/AI/NIAID NIH HHS/United States ; R56 AG069880/AG/NIA NIH HHS/United States ; }, mesh = {Humans ; Prognosis ; *Electronic Health Records ; Causality ; }, abstract = {Determining causal effects of interventions onto outcomes from real-world, observational (non-randomized) data, e.g., treatment repurposing using electronic health records, is challenging due to underlying bias. Causal deep learning has improved over traditional techniques for estimating individualized treatment effects (ITE). We present the Doubly Robust Variational Information-theoretic Deep Adversarial Learning (DR-VIDAL), a novel generative framework that combines two joint models of treatment and outcome, ensuring an unbiased ITE estimation even when one of the two is misspecified. 
DR-VIDAL integrates: (i) a variational autoencoder (VAE) to factorize confounders into latent variables according to causal assumptions; (ii) an information-theoretic generative adversarial network (Info-GAN) to generate counterfactuals; (iii) a doubly robust block incorporating treatment propensities for outcome predictions. On synthetic and real-world datasets (Infant Health and Development Program, Twin Birth Registry, and National Supported Work Program), DR-VIDAL achieves better performance than other non-generative and generative methods. In conclusion, DR-VIDAL uniquely fuses causal assumptions, VAE, Info-GAN, and double robustness into a comprehensive, performant framework. Code is available at: https://github.com/Shantanu48114860/DR-VIDAL-AMIA-22 under MIT license.}, } @article {pmid37116009, year = {2024}, author = {Li, M and Erickson, IM and Cross, EV and Lee, JD}, title = {It's Not Only What You Say, But Also How You Say It: Machine Learning Approach to Estimate Trust from Conversation.}, journal = {Human factors}, volume = {66}, number = {6}, pages = {1724-1741}, pmid = {37116009}, issn = {1547-8181}, mesh = {Humans ; *Trust ; *Machine Learning ; Adult ; Communication ; Male ; Female ; }, abstract = {OBJECTIVE: The objective of this study was to estimate trust from conversations using both lexical and acoustic data.
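For readers unfamiliar with the double robustness property invoked in the DR-VIDAL abstract above, the core idea is the augmented IPW estimator. The following is a minimal sketch of that estimator alone (all variable names illustrative), not the paper's full VAE/Info-GAN pipeline:

```python
import numpy as np

def aipw_ate(y, t, e_hat, m1_hat, m0_hat):
    """Augmented IPW (doubly robust) estimate of the average treatment effect.

    y: observed outcomes; t: binary treatment indicators (0/1);
    e_hat: estimated propensity scores P(T=1|X);
    m1_hat, m0_hat: outcome-model predictions under treatment and control.
    The estimate stays unbiased if either the propensity model or the outcome
    models are correctly specified, which is the property the abstract cites.
    """
    dr1 = m1_hat + t * (y - m1_hat) / e_hat
    dr0 = m0_hat + (1 - t) * (y - m0_hat) / (1 - e_hat)
    return float(np.mean(dr1 - dr0))
```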

BACKGROUND: As NASA moves to long-duration space exploration operations, the increasing need for cooperation between humans and virtual agents requires real-time trust estimation by virtual agents. Measuring trust through conversation is a novel and unintrusive approach.

METHOD: A 2 (reliability) × 2 (cycles) × 3 (events) within-subject study with habitat system maintenance was designed to elicit various levels of trust in a conversational agent. Participants had trust-related conversations with the conversational agent at the end of each decision-making task. To estimate trust, subjective trust ratings were predicted using machine learning models trained on three types of conversational features (i.e., lexical, acoustic, and combined). After training, model explanation was performed using variable importance and partial dependence plots.

RESULTS: Results showed that a random forest algorithm, trained using the combined lexical and acoustic features, predicted trust in the conversational agent most accurately (adjusted R² = 0.71). The most important predictors were a combination of lexical and acoustic cues: average sentiment considering valence shifters, the mean of formants, and Mel-frequency cepstral coefficients (MFCC). These conversational features were identified as partial mediators predicting people's trust.
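As a rough illustration of the modeling step reported above, a random forest regressor fit on combined lexical and acoustic features can be scored with an adjusted R². The data, feature layout, and hyperparameters below are stand-ins, not the study's:

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

rng = np.random.default_rng(0)
X = rng.normal(size=(120, 20))   # columns: lexical (sentiment) + acoustic (formant means, MFCCs)
y = rng.normal(size=120)         # subjective trust ratings (stand-in values)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
model = RandomForestRegressor(n_estimators=500, random_state=0).fit(X_tr, y_tr)

r2 = r2_score(y_te, model.predict(X_te))
n, p = X_te.shape
r2_adj = 1 - (1 - r2) * (n - 1) / (n - p - 1)   # adjusted R^2, the metric reported above
print(f"adjusted R^2 = {r2_adj:.2f}")
```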

CONCLUSION: Precise trust estimation from conversation requires both lexical and acoustic cues.

APPLICATION: These results showed the possibility of using conversational data to measure trust, and potentially other dynamic mental states, unobtrusively and dynamically.}, } @article {pmid37106680, year = {2023}, author = {Teixeira, FL and Costa, MRE and Abreu, JP and Cabral, M and Soares, SP and Teixeira, JP}, title = {A Narrative Review of Speech and EEG Features for Schizophrenia Detection: Progress and Challenges.}, journal = {Bioengineering (Basel, Switzerland)}, volume = {10}, number = {4}, pages = {}, pmid = {37106680}, issn = {2306-5354}, support = {UIDB/05757/2020//Fundação para a Ciência e Tecnologia/ ; UIDP/05757/2020//Fundação para a Ciência e Tecnologia/ ; LA/P/0007/2021//Fundação para a Ciência e Tecnologia/ ; }, abstract = {Schizophrenia is a mental illness that affects an estimated 21 million people worldwide. The literature establishes that electroencephalography (EEG) is a well-implemented means of studying and diagnosing mental disorders. However, it is known that speech and language provide unique and essential information about human thought. Semantic and emotional content, semantic coherence, syntactic structure, and complexity can thus be combined in a machine learning process to detect schizophrenia. Several studies show that early identification is crucial to prevent the onset of illness or mitigate possible complications. Therefore, it is necessary to identify disease-specific biomarkers for an early diagnosis support system. This work contributes to improving our knowledge about schizophrenia and the features that can identify this mental illness via speech and EEG. The emotional state is a specific characteristic of schizophrenia that can be identified with speech emotion analysis. The speech features most often used in the literature are fundamental frequency (F0), intensity/loudness (I), frequency formants (F1, F2, and F3), Mel-frequency cepstral coefficients (MFCCs), the duration of pauses and sentences (SD), and the duration of silence between words. Combining at least two feature categories achieved high accuracy in schizophrenia classification. Prosodic and spectral or temporal features achieved the highest accuracy. The work with the highest accuracy used the prosodic and spectral features QEVA, SDVV, and SSDL, which were derived from the F0 and spectrogram. The emotional state can be identified with most of the features previously mentioned (F0, I, F1, F2, F3, MFCCs, and SD), linear prediction cepstral coefficients (LPCC), linear spectral features (LSF), and the pause rate. Using event-related potentials (ERP), the most promising features found in the literature are mismatch negativity (MMN), P2, P3, P50, N1, and N2. The EEG features with the highest accuracy in schizophrenia classification are the nonlinear features, such as Cx, HFD, and Lya.}, } @article {pmid37105171, year = {2023}, author = {Oganian, Y and Bhaya-Grossman, I and Johnson, K and Chang, EF}, title = {Vowel and formant representation in the human auditory speech cortex.}, journal = {Neuron}, volume = {111}, number = {13}, pages = {2105-2118.e4}, pmid = {37105171}, issn = {1097-4199}, support = {R01 DC012379/DC/NIDCD NIH HHS/United States ; U01 NS117765/NS/NINDS NIH HHS/United States ; }, mesh = {Humans ; Speech ; Phonetics ; Auditory Perception ; *Speech Perception ; *Auditory Cortex ; }, abstract = {Vowels, a fundamental component of human speech across all languages, are cued acoustically by formants, resonance frequencies of the vocal tract shape during speaking.
An outstanding question in neurolinguistics is how formants are processed neurally during speech perception. To address this, we collected high-density intracranial recordings from the human speech cortex on the superior temporal gyrus (STG) while participants listened to continuous speech. We found that two-dimensional receptive fields based on the first two formants provided the best characterization of vowel sound representation. Neural activity at single sites was highly selective for zones in this formant space. Furthermore, formant tuning is adjusted dynamically for speaker-specific spectral context. However, the entire population of formant-encoding sites was required to accurately decode single vowels. Overall, our results reveal that complex acoustic tuning in the two-dimensional formant space underlies local vowel representations in STG. As a population code, this gives rise to phonological vowel perception.}, } @article {pmid37080890, year = {2023}, author = {Herbst, CT and Story, BH and Meyer, D}, title = {Acoustical Theory of Vowel Modification Strategies in Belting.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.01.004}, pmid = {37080890}, issn = {1873-4588}, abstract = {Various authors have argued that belting is to be produced by "speech-like" sounds, with the first and second supraglottic vocal tract resonances (fR1 and fR2) at frequencies of the vowels determined by the lyrics to be sung. Acoustically, the hallmark of belting has been identified as a dominant second harmonic, possibly enhanced by first resonance tuning (fR1≈2fo). It is not clear how both these concepts - (a) phonating with "speech-like," unmodified vowels; and (b) producing a belting sound with a dominant second harmonic, typically enhanced by fR1 - can be upheld when singing across a singer's entire musical pitch range. For instance, anecdotal reports from pedagogues suggest that vowels with a low fR1, such as [i] or [u], might have to be modified considerably (by raising fR1) in order to phonate at higher pitches. These issues were systematically addressed in silico with respect to treble singing, using a linear source-filter voice production model. The dominant harmonic of the radiated spectrum was assessed in 12987 simulations, covering a parameter space of 37 fundamental frequencies (fo) across the musical pitch range from C3 to C6; 27 voice source spectral slope settings from -4 to -30 dB/octave; computed for 13 different IPA vowels. The results suggest that, for most unmodified vowels, the stereotypical belting sound characteristics with a dominant second harmonic can only be produced over a pitch range of about a musical fifth, centered at fo≈0.5fR1. In the [ɔ] and [ɑ] vowels, that range is extended to an octave, supported by a low second resonance. Data aggregation - considering the relative prevalence of vowels in American English - suggests that, historically, belting with fR1≈2fo was derived from speech, and that songs with an extended musical pitch range likely demand considerable vowel modification. 
We thus argue that - on acoustical grounds - the pedagogical commandment for belting with unmodified, "speech-like" vowels cannot always be fulfilled.}, } @article {pmid37078508, year = {2023}, author = {Dillon, MT and Helpard, L and Brown, KD and Selleck, AM and Richter, ME and Rooth, MA and Thompson, NJ and Dedmon, MM and Ladak, HM and Agrawal, S}, title = {Influence of the Frequency-to-Place Function on Recognition with Place-Based Cochlear Implant Maps.}, journal = {The Laryngoscope}, volume = {133}, number = {12}, pages = {3540-3547}, doi = {10.1002/lary.30710}, pmid = {37078508}, issn = {1531-4995}, support = {//Academic Medical Organization of Southwestern Ontario/ ; //MED-EL Medical Electronics/ ; //Natural Sciences and Engineering Research Council of Canada/ ; }, mesh = {Adult ; Humans ; *Cochlear Implants ; Artificial Intelligence ; *Speech Perception ; *Cochlear Implantation ; Cochlea/anatomy & histology ; Acoustic Stimulation/methods ; }, abstract = {OBJECTIVE: Comparison of acute speech recognition for cochlear implant (CI) alone and electric-acoustic stimulation (EAS) users listening with default maps or place-based maps using either a spiral ganglion (SG) or a new Synchrotron Radiation-Artificial Intelligence (SR-AI) frequency-to-place function.

METHODS: Thirteen adult CI-alone or EAS users completed a task of speech recognition at initial device activation with maps that differed in the electric filter frequency assignments. The three map conditions were: (1) maps with the default filter settings (default map), (2) place-based maps with filters aligned to cochlear SG tonotopicity using the SG function (SG place-based map), and (3) place-based maps with filters aligned to cochlear Organ of Corti (OC) tonotopicity using the SR-AI function (SR-AI place-based map). Speech recognition was evaluated using a vowel recognition task. Performance was scored as the percent correct for formant 1 recognition, because the maps would deviate the most in the estimated cochlear place frequency for low frequencies.
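For intuition about frequency-to-place functions like those compared above, the classic Greenwood map converts a position along the cochlea into a characteristic frequency. It is used here only as a stand-in for illustration; the study's SG and SR-AI functions differ from it:

```python
def greenwood_frequency(x, A=165.4, a=2.1, k=0.88):
    """Greenwood frequency-to-place map for the human cochlea.

    x: position as a fraction of cochlear length from apex (0) to base (1).
    Returns the characteristic frequency in Hz. Constants are the standard
    human values; this is not the SG or SR-AI function from the study.
    """
    return A * (10 ** (a * x) - k)

# Example: assign center frequencies to 12 evenly spaced, hypothetical
# electrode positions along the basal half of the cochlea.
places = [0.3 + 0.05 * i for i in range(12)]
centers = [round(greenwood_frequency(x)) for x in places]
print(centers)
```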

RESULTS: On average, participants had better performance with the OC SR-AI place-based map as compared to the SG place-based map and the default map. A larger performance benefit was observed for EAS users than for CI-alone users.

CONCLUSION: These pilot data suggest that EAS and CI-alone users may experience better performance with a patient-centered mapping approach that accounts for the variability in cochlear morphology (OC SR-AI frequency-to-place function) in the individualization of the electric filter frequencies (place-based mapping procedure).

LEVEL OF EVIDENCE: 3 Laryngoscope, 133:3540-3547, 2023.}, } @article {pmid37071803, year = {2023}, author = {Terband, H and van Brenk, F}, title = {Modeling Responses to Auditory Feedback Perturbations in Adults, Children, and Children With Complex Speech Sound Disorders: Evidence for Impaired Auditory Self-Monitoring?.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {5}, pages = {1563-1587}, doi = {10.1044/2023_JSLHR-22-00379}, pmid = {37071803}, issn = {1558-9102}, mesh = {Young Adult ; Humans ; Child ; Child, Preschool ; *Speech Sound Disorder ; Feedback ; *Speech Perception/physiology ; Reproducibility of Results ; Speech/physiology ; Feedback, Sensory/physiology ; *Stuttering ; }, abstract = {PURPOSE: Previous studies have found that typically developing (TD) children were able to compensate and adapt to auditory feedback perturbations to a similar or larger degree compared to young adults, while children with speech sound disorder (SSD) were found to produce predominantly following responses. However, large individual differences underlie the group-level results. This study investigates possible mechanisms in responses to formant shifts by modeling parameters of feedback and feedforward control of speech production based on behavioral data.

METHOD: SimpleDIVA was used to model an existing dataset of compensation/adaptation behavior to auditory feedback perturbations collected from three groups of Dutch speakers: 50 young adults, twenty-three 4- to 8-year-old children with TD speech, and seven 4- to 8-year-old children with SSD. Between-groups and individual within-group differences in model outcome measures representing auditory and somatosensory feedback control gain and feedforward learning rate were assessed.
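The logic of the model parameters named above (auditory feedback gain, somatosensory feedback gain, feedforward learning rate) can be sketched with a toy, single-formant adaptation loop. This is a schematic in the spirit of DIVA-style control, not the actual SimpleDIVA code; note how a negative auditory gain yields the "following" responses described in the conclusions below:

```python
import numpy as np

def adapt(n_trials=50, perturb=100.0, alpha_aud=0.5, lam=0.1, target=500.0):
    """Toy single-formant adaptation to a constant auditory perturbation (Hz).

    alpha_aud: auditory feedback control gain (negative values produce
    following responses); lam: feedforward learning rate. Schematic only.
    """
    ff = target                       # feedforward command
    produced = np.empty(n_trials)
    for i in range(n_trials):
        out = ff                              # planned production this trial
        err = target - (out + perturb)        # auditory error under perturbation
        out += alpha_aud * err                # within-trial feedback correction
        ff += lam * (target - (out + perturb))  # slow feedforward update
        produced[i] = out
    return produced

compensating = adapt()               # output drifts opposite the shift
following = adapt(alpha_aud=-0.3)    # negative gain: output drifts with the shift
```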

RESULTS: Notable between-groups and within-group variation was found for all outcome measures. Data modeled for individual speakers yielded model fits with varying reliability. Auditory feedback control gain was negative in children with SSD and positive in both other groups. Somatosensory feedback control gain was negative for both groups of children and marginally negative for adults. Feedforward learning rate measures were highest in the children with TD speech followed by children with SSD, compared to adults.

CONCLUSIONS: The SimpleDIVA model was able to account for responses to the perturbation of auditory feedback other than corrective, as negative auditory feedback control gains were associated with following responses to vowel shifts. These preliminary findings are suggestive of impaired auditory self-monitoring in children with complex SSD. Possible mechanisms underlying the nature of following responses are discussed.}, } @article {pmid37059081, year = {2023}, author = {Chao, SC and Daliri, A}, title = {Effects of Gradual and Sudden Introduction of Perturbations on Adaptive Responses to Formant-Shift and Formant-Clamp Perturbations.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {5}, pages = {1588-1599}, pmid = {37059081}, issn = {1558-9102}, support = {R01 DC019905/DC/NIDCD NIH HHS/United States ; R01 DC020162/DC/NIDCD NIH HHS/United States ; R21 DC017563/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Speech/physiology ; *Speech Perception ; Phonetics ; Speech Acoustics ; }, abstract = {PURPOSE: When the speech motor system encounters errors, it generates adaptive responses to compensate for the errors. Unlike errors induced by formant-shift perturbations, errors induced by formant-clamp perturbations do not correspond with the speaker's speech (i.e., degraded motor-to-auditory correspondence). We previously showed that adaptive responses to formant-clamp perturbations are smaller than responses to formant-shift perturbations when perturbations are introduced gradually. This study examined responses to formant-clamp and formant-shift perturbations when perturbations are introduced suddenly.

METHOD: One group of participants (n = 30) experienced gradually introduced formant-clamp and formant-shift perturbations, and another group (n = 30) experienced suddenly introduced formant-clamp and formant-shift perturbations. We designed the perturbations based on participant-specific vowel configurations such that a participant's first and second formants of /ɛ/ were perturbed toward their /æ/. To estimate adaptive responses, we measured formant changes (0-100 ms of the vowel) in response to the formant perturbations.
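The participant-specific design described above can be made concrete: a formant-shift displaces the feedback relative to what was produced, along the speaker's own /ɛ/ to /æ/ vector, while a formant-clamp holds the feedback at fixed values regardless of production (the degraded motor-to-auditory correspondence mentioned in the purpose statement). A minimal sketch with made-up formant values:

```python
def shift_toward(eh, ae, produced, magnitude=1.0):
    """Formant-shift: feedback = production displaced along the /e/->/ae/ vector."""
    return tuple(p + magnitude * (a - e) for p, e, a in zip(produced, eh, ae))

def clamp_at(eh, ae, magnitude=1.0):
    """Formant-clamp: feedback fixed at the shifted /e/ target, ignoring production."""
    return tuple(e + magnitude * (a - e) for e, a in zip(eh, ae))

eh_mean = (580.0, 1800.0)   # speaker's /e/ (F1, F2) in Hz; illustrative values
ae_mean = (700.0, 1650.0)   # speaker's /ae/ (F1, F2) in Hz; illustrative values
print(shift_toward(eh_mean, ae_mean, produced=(575.0, 1810.0)))
print(clamp_at(eh_mean, ae_mean))
```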

RESULTS: We found that (a) the difference between responses to formant-clamp and formant-shift perturbations was smaller when the perturbations were introduced suddenly and (b) responses to suddenly introduced (but not gradually introduced) formant-shift perturbations positively correlated with responses to formant-clamp perturbations.

CONCLUSIONS: These results showed that the speech motor system responds to errors induced by formant-shift and formant-clamp perturbations more differently when perturbations are introduced gradually than suddenly. Overall, the quality of errors (formant-shift vs. formant-clamp) and the manner of introducing errors (gradually vs. suddenly) modulate the speech motor system's evaluations of and responses to errors.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.22406422.}, } @article {pmid37040323, year = {2023}, author = {Luo, X and Daliri, A}, title = {The Impact of Bimodal Hearing on Speech Acoustics of Vowel Production in Adult Cochlear Implant Users.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {5}, pages = {1511-1524}, pmid = {37040323}, issn = {1558-9102}, support = {R01 DC019905/DC/NIDCD NIH HHS/United States ; R01 DC020162/DC/NIDCD NIH HHS/United States ; R21 DC017563/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Humans ; *Cochlear Implants ; Speech Acoustics ; *Cochlear Implantation ; Hearing ; *Hearing Aids ; }, abstract = {PURPOSE: This study aimed to investigate the acoustic changes in vowel production with different forms of auditory feedback via cochlear implant (CI), hearing aid (HA), and bimodal hearing (CI + HA).

METHOD: Ten post-lingually deaf adult bimodal CI users (aged 50-78 years) produced English vowels /i/, /ɛ/, /æ/, /ɑ/, /ʊ/, and /u/ in the context of /hVd/ during short-term use of no device (ND), HA, CI, and CI + HA. Segmental features (first formant frequency [F1], second formant frequency [F2], and vowel space area) and suprasegmental features (duration, intensity, and fundamental frequency [fo]) of vowel production were analyzed. Participants also categorized a vowel continuum synthesized from their own productions of /ɛ/ and /æ/ using HA, CI, and CI + HA.
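One of the segmental measures above, vowel space area, is simply the area of the polygon traced by the vowel means in the F1 x F2 plane. A small sketch using the shoelace formula, with illustrative values:

```python
def vowel_space_area(vertices):
    """Polygon area in the F1 x F2 plane via the shoelace formula.

    vertices: (F1, F2) vowel means in Hz, ordered around the perimeter.
    Returns the area in Hz^2.
    """
    n = len(vertices)
    acc = 0.0
    for i in range(n):
        x1, y1 = vertices[i]
        x2, y2 = vertices[(i + 1) % n]
        acc += x1 * y2 - x2 * y1
    return abs(acc) / 2.0

# Corner vowels /i, ae, a, u/ with made-up (F1, F2) means in Hz:
corners = [(300, 2300), (700, 1700), (750, 1100), (350, 900)]
print(vowel_space_area(corners))
```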

RESULTS: F1s of all vowels decreased; F2s of front vowels but not back vowels increased; vowel space areas increased; and vowel durations, intensities, and fos decreased with statistical significance in the HA, CI, and CI + HA conditions relative to the ND condition. Only fos were lower, and vowel space areas were larger with CI and CI + HA than with HA. Average changes in fo, intensity, and F1 from the ND condition to the HA, CI, and CI + HA conditions were positively correlated. Most participants did not show a typical psychometric function for vowel categorization, and thus, the relationship between vowel categorization and production was not tested.

CONCLUSIONS: The results suggest that acoustic, electric, and bimodal hearing have a measurable impact on vowel acoustics of post-lingually deaf adults when their hearing devices are turned on and off temporarily. Also, changes in fo and F1 with the use of hearing devices may be largely driven by changes in intensity.}, } @article {pmid37031224, year = {2023}, author = {Hsu, TC and Wu, BX and Lin, RT and Chien, CJ and Yeh, CY and Chang, TH}, title = {Electron-phonon interaction toward engineering carrier mobility of periodic edge structured graphene nanoribbons.}, journal = {Scientific reports}, volume = {13}, number = {1}, pages = {5781}, pmid = {37031224}, issn = {2045-2322}, support = {NSTC-109-2222-E-002-002-MY3//Ministry of Science and Technology, Taiwan/ ; 110-2622-8-002-014//Taiwan Semiconductor Manufacturing Company/ ; }, abstract = {Graphene nanoribbons have many extraordinary electrical properties and are candidates for the semiconductor industry. In this research, we propose a design of Coved GNRs with periodic structures ranging from 4 to 8 nm or more, whose sizes are within the practical feature sizes of advanced lithography tools. The Coved GNRs with the periodic coved shape are designed to break the localized electronic states and reduce electron-phonon scattering. In this way, the mobility of Coved GNRs can be enhanced by orders of magnitude compared with zigzag GNRs of the same width. Moreover, in contrast to the occasional zero-bandgap transitions of armchair and zigzag GNRs fabricated without atomic-level precision control, the Coved GNRs with periodic edge structures can exclude the zero-bandgap conditions, which makes the mass-production process practical. The designed Coved GNRs are fabricated over the germanium (110) substrate, where the graphene can be prepared in single-crystalline and single-oriented form and the edges of the GNRs are later repaired under "balanced condition growth". We demonstrate that the proposed coved structures are compatible with current fabrication facilities.}, } @article {pmid37015000, year = {2023}, author = {Vorperian, HK and Kent, RD and Lee, Y and Buhr, KA}, title = {Vowel Production in Children and Adults With Down Syndrome: Fundamental and Formant Frequencies of the Corner Vowels.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {4}, pages = {1208-1239}, pmid = {37015000}, issn = {1558-9102}, support = {R01 DC006282/DC/NIDCD NIH HHS/United States ; U54 HD090256/HD/NICHD NIH HHS/United States ; }, mesh = {Male ; Female ; Humans ; Adult ; Child ; Child, Preschool ; Adolescent ; Young Adult ; Middle Aged ; Aged ; Aged, 80 and over ; *Speech Acoustics ; *Down Syndrome ; Phonetics ; Speech Intelligibility ; Acoustics ; }, abstract = {PURPOSE: Atypical vowel production contributes to reduced speech intelligibility in children and adults with Down syndrome (DS). This study compares the acoustic data of the corner vowels /i/, /u/, /æ/, and /ɑ/ from speakers with DS against typically developing/developed (TD) speakers.

METHOD: Measurements of the fundamental frequency (fo) and first four formant frequencies (F1-F4) were obtained from single word recordings containing the target vowels from 81 participants with DS (ages 3-54 years) and 293 TD speakers (ages 4-92 years), all native speakers of English. The data were used to construct developmental trajectories and to determine interspeaker and intraspeaker variability.
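Extraction of fo and F1-F4 from single-word recordings, as described above, is commonly scripted through Praat. A minimal sketch using the parselmouth bindings; the file name and analysis settings are placeholders, not the study's:

```python
import parselmouth  # Praat bindings: pip install praat-parselmouth

snd = parselmouth.Sound("word.wav")        # hypothetical recording
pitch = snd.to_pitch()
f0 = pitch.selected_array['frequency']     # fo track in Hz (0 where unvoiced)

formants = snd.to_formant_burg(max_number_of_formants=5, maximum_formant=5500)
t_mid = snd.duration / 2                   # assumes the vowel sits mid-file
f1_to_f4 = [formants.get_value_at_time(n, t_mid) for n in range(1, 5)]
print(f1_to_f4)
```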

RESULTS: Trajectories for DS differed from TD based on age and sex, but the groups were similar in the striking change in fo and F1-F4 frequencies around age 10 years. Findings confirm higher fo in DS, and vowel-specific differences between DS and TD in F1 and F2 frequencies, but not F3 and F4. The F2 difference between front and back vowels was a more sensitive measure of vowel space compression than vowel space area/centralization across age and sex. Low vowels showed more pronounced F2 compression, which was related to reduced speech intelligibility. Intraspeaker variability was significantly greater for DS than TD for nearly all frequency values across age.

DISCUSSION: Vowel production differences between DS and TD are age- and sex-specific, which helps explain contradictory results in previous studies. Increased intraspeaker variability across age in DS confirms the presence of a persisting motor speech disorder. Atypical vowel production in DS is common and related to dysmorphology, delayed development, and disordered motor control.}, } @article {pmid37005127, year = {2023}, author = {Capobianco, S and Nacci, A and Calcinoni, O and Bruschini, L and Berrettini, S and Bottalico, P}, title = {Assessing Acoustic Parameters in Early Music and Romantic Operatic Singing.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.02.009}, pmid = {37005127}, issn = {1873-4588}, abstract = {OBJECTIVE: Since the recent early music (EM) revival, a subset of singers have begun to specialize in a style of singing that is perceptually different from the more "mainstream" romantic operatic (RO) singing style. The aim of this study is to characterize EM with respect to RO singing in terms of its vibrato characteristics and the singer's formant cluster.

STUDY DESIGN: This study presents a within-subject experimental design.

METHODS: Ten professional singers (5 F; 5 M) versed in both EM and RO repertoire were enrolled in the study. Each singer recorded the first 10 bars of the famous aria "Amarilli Mia Bella" (Giulio Caccini, 1602) a cappella, in RO and EM styles, in random order. Three sustained notes were extracted from the acoustic recordings and analyzed using the free, user-friendly software BioVoice to extract five parameters: vibrato rate, vibrato extent, vibrato jitter (Jvib), vibrato shimmer, and quality ratio (QR), an estimation of the singer's formant power.
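Vibrato rate and extent, two of the parameters listed above, can be estimated directly from an fo contour of a sustained note. A rough sketch using peak picking (not the BioVoice algorithm), demonstrated on a synthetic 5.5 Hz vibrato:

```python
import numpy as np
from scipy.signal import find_peaks

def vibrato_rate_extent(f0_hz, frames_per_sec):
    """Estimate vibrato rate (Hz) and extent (semitones) from a sustained-note
    fo track. Sketch only; not the BioVoice implementation."""
    st = 12 * np.log2(f0_hz / np.mean(f0_hz))   # contour in semitones re mean fo
    peaks, _ = find_peaks(st)
    troughs, _ = find_peaks(-st)
    rate = (len(peaks) - 1) * frames_per_sec / (peaks[-1] - peaks[0])
    extent = (np.mean(st[peaks]) - np.mean(st[troughs])) / 2  # half peak-to-trough
    return rate, extent

t = np.arange(0, 2, 0.01)                        # 100 frames/s for 2 s
f0 = 440 * 2 ** (0.5 * np.sin(2 * np.pi * 5.5 * t) / 12)  # +/- 0.5 st around 440 Hz
print(vibrato_rate_extent(f0, 100))              # approx. (5.5, 0.5)
```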

RESULTS: Vibrato in EM singing was characterized by a higher rate, a smaller extent, and a less regular cycle-to-cycle period duration (higher Jvib) compared to RO singing. As in previous studies, RO singing presented a more prominent singer's formant, as indicated by a smaller QR.

CONCLUSIONS: Acoustical analysis of some vibrato characteristics and the Singer's Formant significantly differentiated EM from RO singing styles. Given the acoustical distinctions between EM and RO styles, future scientific and musicological studies should consider distinguishing between the two styles rather than using a single term for, and description of, Western Classical singing.}, } @article {pmid37003707, year = {2023}, author = {Wood, S}, title = {Dating the open /æ/ sound change in Southern British English.}, journal = {JASA express letters}, volume = {3}, number = {3}, pages = {035205}, doi = {10.1121/10.0015281}, pmid = {37003707}, issn = {2691-1191}, abstract = {The new open /æ/ was not noticed in the non-regional received pronunciation (RP) accent of Southern British English until the 1980s. Dating to the 1950s or 1920s had been suggested, but the earliest known regional speaker exhibiting it was born in Kent in the 1860s. Formant data from archived recordings of 29 Southeastern speakers, born between the 1850s and 1960s, were studied using two methods: inspection of formant diagrams for closer /æ/, and modelling low vowels for open /æ/. The earliest RP speaker found with new open /æ/ was born in 1857, demonstrating that this type of sound change had started by the 1850s.}, } @article {pmid37002095, year = {2023}, author = {Serrurier, A and Neuschaefer-Rube, C}, title = {Morphological and acoustic modeling of the vocal tract.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {3}, pages = {1867}, doi = {10.1121/10.0017356}, pmid = {37002095}, issn = {1520-8524}, mesh = {Male ; Female ; Humans ; *Speech Acoustics ; Phonetics ; Speech ; *Voice ; Acoustics ; }, abstract = {In speech production, the anatomical morphology forms the substrate on which the speakers build their articulatory strategy to reach specific articulatory-acoustic goals. The aim of this study is to characterize morphological inter-speaker variability by building a shape model of the full vocal tract including hard and soft structures. Static magnetic resonance imaging data from 41 speakers articulating altogether 1947 phonemes were considered, and the midsagittal articulator contours were manually outlined. A phoneme-independent average-articulation representative of morphology was calculated as the speaker mean articulation. A principal component analysis-driven shape model was derived from average-articulations, leading to five morphological components, which explained 87% of the variance. Almost three-quarters of the variance was related to independent variations of the horizontal oral and vertical pharyngeal lengths, the latter capturing male-female differences. The three additional components captured shape variations related to head tilt and palate shape. Plane wave propagation acoustic simulations were run to characterize morphological components. A lengthening of 1 cm of the vocal tract in the vertical or horizontal directions led to a decrease in formant values of 7%-8%. Further analyses are required to analyze three-dimensional variability and to understand the morphological-acoustic relationships per phoneme.
Average-articulations and model code are publicly available (https://github.com/tonioser/VTMorphologicalModel).}, } @article {pmid36949035, year = {2023}, author = {Lou, Q and Wang, X and Chen, Y and Wang, G and Jiang, L and Liu, Q}, title = {Subjective and Objective Evaluation of Speech in Adult Patients With Repaired Cleft Palate.}, journal = {The Journal of craniofacial surgery}, volume = {34}, number = {6}, pages = {e551-e556}, doi = {10.1097/SCS.0000000000009301}, pmid = {36949035}, issn = {1536-3732}, mesh = {Adult ; Humans ; *Cleft Palate/surgery ; Speech ; Pharynx/surgery ; *Velopharyngeal Insufficiency/surgery ; Pharyngeal Muscles ; }, abstract = {OBJECTIVE: To explore the speech outcomes of adult patients with repaired cleft palate through subjective perception evaluation and objective acoustic analysis, and to compare the differences in pronunciation characteristics between speakers with complete velopharyngeal closure (VPC) and velopharyngeal insufficiency (VPI) patients.

PARTICIPANTS AND INTERVENTION: Subjective evaluation indicators included speech intelligibility, nasality, and consonant missing rate. For objective acoustic analysis, speech samples were normalized, and the objective acoustic parameters included normalized vowel formants, voice onset time, and the analysis of the 3-dimensional spectrogram and spectrum. Analyses were carried out on speech samples produced by four groups of speakers: (a) speakers with velopharyngeal competence after palatorrhaphy (n=38); (b) speakers with velopharyngeal incompetence after palatorrhaphy (n=70); (c) adult patients with cleft palate (n=65); and (d) typical speakers (n=30).

RESULTS: There was a highly negative correlation between VPC grade and speech intelligibility (ρ=-0.933), and a highly positive correlation between VPC and nasality (ρ=0.813). In subjective evaluation, the speech level of VPI patients was significantly lower than that of VPC patients and normal adults. Although the nasality and consonant loss rate of VPC patients were significantly higher than those of normal adults, the speech intelligibility of VPC patients was not significantly different from that of normal adults. In acoustic analysis, patients with VPI still performed poorly compared with patients with VPC.

CONCLUSIONS: The speech function of adult cleft palate patients is affected by abnormal palatal structure and poor pronunciation habits. In subjective evaluation, there was no significant difference in speech level between VPC patients and normal adults, whereas there was a significant difference between VPI patients and normal adults. The acoustic parameters were different between the 2 groups after cleft palate repair. The status of velopharyngeal closure after cleft palate repair can affect the patient's speech.}, } @article {pmid36946195, year = {2023}, author = {Easwar, V and Purcell, D and Wright, T}, title = {Predicting Hearing aid Benefit Using Speech-Evoked Envelope Following Responses in Children With Hearing Loss.}, journal = {Trends in hearing}, volume = {27}, number = {}, pages = {23312165231151468}, pmid = {36946195}, issn = {2331-2165}, mesh = {Adolescent ; Child ; Female ; Humans ; Male ; Evoked Potentials, Auditory ; *Hearing Aids ; *Hearing Loss/physiopathology/therapy ; *Speech Perception/physiology ; Speech/physiology ; }, abstract = {Electroencephalography could serve as an objective tool to evaluate hearing aid benefit in infants who are developmentally unable to participate in hearing tests. We investigated whether speech-evoked envelope following responses (EFRs), a type of electroencephalography-based measure, could predict improved audibility with the use of a hearing aid in children with mild-to-severe permanent, mainly sensorineural, hearing loss. In 18 children, EFRs were elicited by six male-spoken band-limited phonemic stimuli--the first formants of /u/ and /i/, the second and higher formants of /u/ and /i/, and the fricatives /s/ and /∫/--presented together as /su∫i/. EFRs were recorded between the vertex and nape, when /su∫i/ was presented at 55, 65, and 75 dB SPL using insert earphones in unaided conditions and individually fit hearing aids in aided conditions. EFR amplitude and detectability improved with the use of a hearing aid, and the degree of improvement in EFR amplitude was dependent on the extent of change in behavioral thresholds between unaided and aided conditions. EFR detectability was primarily influenced by audibility; higher sensation level stimuli had an increased probability of detection. Overall EFR sensitivity in predicting audibility was significantly higher in aided (82.1%) than unaided conditions (66.5%) and did not vary as a function of stimulus or frequency. EFR specificity in ascertaining inaudibility was 90.8%. Aided improvement in EFR detectability was a significant predictor of hearing aid-facilitated change in speech discrimination accuracy. Results suggest that speech-evoked EFRs could be a useful objective tool in predicting hearing aid benefit in children with hearing loss.}, } @article {pmid36945094, year = {2023}, author = {Duan, H and Xie, Q and Zhang, Z}, title = {Characteristics of Alveolo-palatal Affricates Produced by Mandarin-speaking Children with Repaired Cleft Palate.}, journal = {American journal of health behavior}, volume = {47}, number = {1}, pages = {13-20}, doi = {10.5993/AJHB.47.1.2}, pmid = {36945094}, issn = {1945-7359}, mesh = {Humans ; Child ; Child, Preschool ; *Cleft Palate/surgery ; Phonetics ; Language ; }, abstract = {Objectives: In this study, we examined the acoustic properties of the alveolo-palatal affricates /tɕ/ and /tɕʰ/ in Mandarin Chinese, and analyzed the differences in the acoustic characteristics of these affricates as produced by children with repaired cleft palate and normally developing children.
We also explored the relationship between the affricates and the high-front vowel /i/. Methods: We analyzed 16 monosyllabic words with alveolo-palatal affricates as the initial consonants, produced by children with repaired cleft palate (N=13, mean age=5.9 years) and normally developing children (N=6, mean age=5.3 years). We used several acoustic parameters to investigate the characteristics of these affricates, such as the center of gravity, VOT, and the formants of vowels. Results: Compared with normally developing children, children with cleft palate exhibited a lower center of gravity for the 2 affricates /tɕ/ and /tɕʰ/. Data from the control group showed that the affricate /tɕʰ/ had a significantly greater center of gravity than that of /tɕ/. The accuracy of /tɕ, tɕʰ/ produced by speakers with cleft palate was significantly correlated with that of /i/ (r=0.63). The high-front vowel /i/ is a significant index in diagnosing speech intelligibility and is more valuable than /a/ and /u/. There was a significant difference in F2 of the vowel /i/ between children with cleft palate before speech therapy (CS1) and after speech therapy (CS2). After speech intervention, the accuracy of the affricates produced by children with cleft palate improved, and the acoustic properties of "stop + noise segments" appeared. Conclusion: Children with cleft palate can be better distinguished from children with normal development by 2 significant acoustic characteristics: center of gravity and VOT. As the alveolo-palatal affricates /tɕ, tɕʰ/ and the high-front vowel /i/ have a similar place of articulation, the front of the tongue blade, their production accuracy can be improved mutually. The analysis showed that the articulation of Chinese /i/ has a higher and more frontal lingual position and less variability, which is more conducive to articulation training and improves the effect of cleft palate training. These findings suggest a potential relationship between the affricates /tɕ, tɕʰ/ and the vowel /i/. Children with cleft palate have difficulty pronouncing /tɕ, tɕʰ/ and /i/. It is better to start training with the vowel /i/, resulting in improvement in overall speech intelligibility.}, } @article {pmid36938342, year = {2023}, author = {Alghowinem, S and Gedeon, T and Goecke, R and Cohn, JF and Parker, G}, title = {Interpretation of Depression Detection Models via Feature Selection Methods.}, journal = {IEEE transactions on affective computing}, volume = {14}, number = {1}, pages = {133-152}, pmid = {36938342}, issn = {1949-3045}, support = {R01 MH051435/MH/NIMH NIH HHS/United States ; R01 MH065376/MH/NIMH NIH HHS/United States ; R01 MH096951/MH/NIMH NIH HHS/United States ; }, abstract = {Given the prevalence of depression worldwide and its major impact on society, several studies employed artificial intelligence modelling to automatically detect and assess depression. However, interpretation of these models and cues are rarely discussed in detail in the AI community, but have received increased attention lately. In this study, we aim to analyse the commonly selected features using a proposed framework of several feature selection methods and their effect on the classification results, which will provide an interpretation of the depression detection model. The developed framework aggregates and selects the most promising features for modelling depression detection from 38 feature selection algorithms of different categories. Using three real-world depression datasets, 902 behavioural cues were extracted from speech behaviour, speech prosody, eye movement and head pose.
To verify the generalisability of the proposed framework, we applied the entire process to depression datasets individually and when combined. The results from the proposed framework showed that speech behaviour features (e.g. pauses) are the most distinctive features of the depression detection model. From the speech prosody modality, the strongest feature groups were F0, HNR, formants, and MFCC, while for the eye activity modality they were left-right eye movement and gaze direction, and for the head modality it was yaw head movement. Modelling depression detection using the selected features (even though there are only 9 features) outperformed using all features in all the individual and combined datasets. Our feature selection framework not only provided an interpretation of the model but also produced higher accuracy of depression detection with a small number of features in varied datasets. This could help to reduce the processing time needed to extract features and create the model.}, } @article {pmid36882955, year = {2023}, author = {Hauser, I}, title = {Differential Cue Weighting in Mandarin Sibilant Production.}, journal = {Language and speech}, volume = {66}, number = {4}, pages = {1056-1090}, pmid = {36882955}, issn = {1756-6053}, mesh = {Humans ; *Cues ; Phonetics ; *Speech Perception ; Speech ; Speech Acoustics ; }, abstract = {Individual talkers vary in their relative use of different cues to signal phonological contrast. Previous work provides limited and conflicting data on whether such variation is modulated by cue trading or individual differences in speech style. This paper examines differential cue weighting patterns in Mandarin sibilants as a test case for these hypotheses. Standardized Mandarin exhibits a three-way place contrast between retroflex, alveopalatal, and alveolar sibilants with individual differences in relative weighting of spectral center of gravity (COG) and the second formant of the following vowel (F2). In results from a speech production task, cue weights of COG and F2 are inversely correlated across speakers, demonstrating a trade-off relationship in cue use. These findings are consistent with a cue trading account of individual differences in contrast signaling.}, } @article {pmid36880531, year = {2023}, author = {Yang, X and Guo, C and Zhang, M and Li, Y and Ren, M and Mao, S and Dhakal, R and Kim, NY and Dong, Z and Sun, B and Yao, Z}, title = {Ultrahigh-sensitivity multi-parameter tacrolimus solution detection based on an anchor planar millifluidic microwave biosensor.}, journal = {Analytical methods : advancing methods and applications}, volume = {15}, number = {14}, pages = {1765-1774}, doi = {10.1039/d3ay00100h}, pmid = {36880531}, issn = {1759-9679}, mesh = {*Tacrolimus ; Microwaves ; Radio Waves ; Limit of Detection ; *Biosensing Techniques ; }, abstract = {To detect drug concentration in tacrolimus solution, an anchor planar millifluidic microwave (APMM) biosensor is proposed. The millifluidic system integrated with the sensor enables accurate and efficient detection while eliminating interference caused by the fluidity of the tacrolimus sample. Different concentrations (10-500 ng mL[-1]) of the tacrolimus analyte were introduced into the millifluidic channel, where it completely interacts with the radio frequency patch electromagnetic field, thereby effectively and sensitively modifying the resonant frequency and amplitude of the transmission coefficient.
Experimental results indicate that the sensor has an extremely low limit of detection (LoD) of 0.12 pg mL[-1] and a frequency detection resolution (FDR) of 1.59 MHz/(ng mL[-1]). The greater the FDR and the lower the LoD, the more feasible the label-free biosensing method. Regression analysis revealed a strong linear correlation (R[2] = 0.992) between the concentration of tacrolimus and the frequency difference of the two resonant peaks of APMM. In addition, the difference in the reflection coefficient between the two formants was measured and calculated, and a strong linear correlation (R[2] = 0.998) was found between the difference and tacrolimus concentration. Five measurements were performed on each individual sample of tacrolimus to validate the biosensor's high repeatability. Consequently, the proposed biosensor is a potential candidate for the early detection of tacrolimus drug concentration levels in organ transplant recipients. This study presents a simple method for constructing microwave biosensors with high sensitivity and rapid response.}, } @article {pmid36859160, year = {2023}, author = {Liu, Z and Xu, Y}, title = {Deep learning assessment of syllable affiliation of intervocalic consonants.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {2}, pages = {848}, doi = {10.1121/10.0017117}, pmid = {36859160}, issn = {1520-8524}, mesh = {Male ; Humans ; *Deep Learning ; Acoustics ; Emotions ; Judgment ; Language ; }, abstract = {In English, a sentence like "He made out our intentions." could be misperceived as "He may doubt our intentions." because the coda /d/ sounds like it has become the onset of the next syllable. The nature and occurrence conditions of this resyllabification phenomenon are unclear, however. Previous empirical studies mainly relied on listener judgment, limited acoustic evidence, such as voice onset time, or average formant values to determine the occurrence of resyllabification. This study tested the hypothesis that resyllabification is a coarticulatory reorganisation that realigns the coda consonant with the vowel of the next syllable. Deep learning in conjunction with dynamic time warping (DTW) was used to assess syllable affiliation of intervocalic consonants. The results suggest that convolutional neural network- and recurrent neural network-based models can detect cases of resyllabification using Mel-frequency spectrograms. DTW analysis shows that neural network inferred resyllabified sequences are acoustically more similar to their onset counterparts than their canonical productions. A binary classifier further suggests that, similar to the genuine onsets, the inferred resyllabified coda consonants are coarticulated with the following vowel.
These results are interpreted with an account of resyllabification as a speech-rate-dependent coarticulatory reorganisation mechanism in speech.}, } @article {pmid36859151, year = {2023}, author = {Lasota, M and Šidlof, P and Maurerlehner, P and Kaltenbacher, M and Schoder, S}, title = {Anisotropic minimum dissipation subgrid-scale model in hybrid aeroacoustic simulations of human phonation.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {2}, pages = {1052}, doi = {10.1121/10.0017202}, pmid = {36859151}, issn = {1520-8524}, mesh = {Humans ; *Acoustics ; Anisotropy ; Computer Simulation ; *Computer Systems ; Phonation ; }, abstract = {This article deals with large-eddy simulations of three-dimensional incompressible laryngeal flow followed by acoustic simulations of human phonation of five cardinal English vowels, /ɑ, æ, i, o, u/. The flow and aeroacoustic simulations were performed in OpenFOAM and in-house code openCFS, respectively. Given the large variety of scales in the flow and acoustics, the simulation is separated into two steps: (1) computing the flow in the larynx using the finite volume method on a fine moving grid with 2.2 million elements, followed by (2) computing the sound sources separately and wave propagation to the radiation zone around the mouth using the finite element method on a coarse static grid with 33 000 elements. The numerical results showed that the anisotropic minimum dissipation model, which is not well known since it is not available in common CFD software, predicted stronger sound pressure levels at higher harmonics, and especially at the first two formants, than the wall-adapting local eddy-viscosity model. When this model was applied to the turbulent flow in the larynx, a positive impact on the quality of the simulated vowels was found.}, } @article {pmid36857868, year = {2023}, author = {Huang, Z and Lobbezoo, F and Vanhommerig, JW and Volgenant, CMC and de Vries, N and Aarab, G and Hilgevoord, AAJ}, title = {Effects of demographic and sleep-related factors on snoring sound parameters.}, journal = {Sleep medicine}, volume = {104}, number = {}, pages = {3-10}, doi = {10.1016/j.sleep.2023.02.012}, pmid = {36857868}, issn = {1878-5506}, mesh = {Adult ; Humans ; Male ; Middle Aged ; *Snoring ; *Sleep Apnea, Obstructive ; Sleep ; Polysomnography ; Demography ; }, abstract = {OBJECTIVE: To investigate the effect of frequently reported between-individual (viz., age, gender, body mass index [BMI], and apnea-hypopnea index [AHI]) and within-individual (viz., sleep stage and sleep position) snoring sound-related factors on snoring sound parameters in temporal, intensity, and frequency domains.

METHODS: This study included 83 adult snorers (mean ± SD age: 42.2 ± 11.3 yrs; male gender: 59%) who underwent an overnight polysomnography (PSG) and simultaneous sound recording, from which a total of 131,745 snoring events were extracted and analyzed. Data on both between-individual and within-individual factors were extracted from the participants' PSG reports.
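The dominant frequency (DF) of a snoring event, one of the frequency-domain parameters analyzed in this study, is typically taken as the peak of the event's power spectrum. A minimal sketch over one synthetic event, with an illustrative sampling rate:

```python
import numpy as np
from scipy.signal import welch

def dominant_frequency(event, fs):
    """Dominant frequency (Hz) of one snoring event via Welch's PSD estimate."""
    freqs, psd = welch(event, fs=fs, nperseg=min(2048, len(event)))
    return freqs[np.argmax(psd)]

fs = 8000                                  # illustrative sampling rate
t = np.arange(0, 1.0, 1 / fs)
event = np.sin(2 * np.pi * 110 * t) + 0.3 * np.random.default_rng(0).normal(size=t.size)
print(dominant_frequency(event, fs))       # approx. 110 Hz
```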

RESULTS: Gender did not have any significant effect on snoring sound parameters. The fundamental frequency (FF; coefficient = -0.31; P = 0.02) and dominant frequency (DF; coefficient = -12.43; P < 0.01) of snoring sounds decreased with increasing age, and the second formant increased (coefficient = 22.91; P = 0.02) with increasing BMI. Severe obstructive sleep apnea (OSA; AHI ≥30 events/hour), non-rapid eye movement sleep stage 3 (N3), and supine position were all associated with more, longer, and louder snoring events (P < 0.05). Supine position was associated with higher FF and DF, and lateral decubitus positions were associated with higher formants.

CONCLUSIONS: Within the limitations of the current patient profile and included factors, AHI was found to have greater effects on snoring sound parameters than the other between-individual factors. The included within-individual factors were found to have greater effects on snoring sound parameters than the between-individual factors under study.}, } @article {pmid36844947, year = {2023}, author = {Wang, L and Jiang, Z}, title = {Tidal Volume Level Estimation Using Respiratory Sounds.}, journal = {Journal of healthcare engineering}, volume = {2023}, number = {}, pages = {4994668}, pmid = {36844947}, issn = {2040-2309}, mesh = {Humans ; *Respiratory Sounds ; Snoring ; Tidal Volume ; *Sleep Apnea, Obstructive ; Algorithms ; }, abstract = {Respiratory sounds have been used as a noninvasive and convenient method to estimate respiratory flow and tidal volume. However, current methods need calibration, making them difficult to use in a home environment. A respiratory sound analysis method is proposed to estimate tidal volume levels during sleep qualitatively. Respiratory sounds are filtered and segmented into one-minute clips, and all clips are clustered into three categories (normal breathing, snoring, and uncertain) with agglomerative hierarchical clustering (AHC). Formant parameters are extracted to classify snoring clips into simple snoring and obstructive snoring with the K-means algorithm. For simple snoring clips, the tidal volume level is calculated based on the snoring duration. For obstructive snoring clips, the tidal volume level is calculated by the maximum breathing pause interval. The performance of the proposed method is evaluated on an open dataset, PSG-Audio, in which full-night polysomnography (PSG) and tracheal sound were recorded simultaneously. The calculated tidal volume levels are compared with the corresponding lowest nocturnal oxygen saturation (LoO2) data. Experiments show that the proposed method calculates tidal volume levels with high accuracy and robustness.}, } @article {pmid36816289, year = {2023}, author = {Aldamen, H and Al-Deaibes, M}, title = {Arabic emphatic consonants as produced by English speakers: An acoustic study.}, journal = {Heliyon}, volume = {9}, number = {2}, pages = {e13401}, pmid = {36816289}, issn = {2405-8440}, abstract = {This study examines the production of emphatic consonants by American L2 learners of Arabic. To this end, 19 participants, 5 native speakers and 14 L2 learners, took part in a production experiment in which they produced monosyllabic CVC pairs that were contrasted in terms of whether the initial consonant was plain or emphatic. The acoustic parameters that were investigated are VOT of voiceless stops, COG of fricatives, and the first three formant frequencies of the target vowels. The results of the native speakers showed that VOT is a reliable acoustic correlate of emphasis in MSA. The results also showed that vowels in the emphatic context have higher F1 and F3 and lower F2. The results showed that the L2 learners produced comparable VOT values to those of native Arabic speakers. Further, L2 learners produced a significantly lower F2 of the vowels in the emphatic context than that in the plain context. Proficiency in Arabic played a role in the F2 measure; the intermediate learners tended to be more native-like than the beginning learners. As for F3, the results of the L2 learners unexpectedly showed that the beginning learners produced a higher F3 in the context of fricatives only.
This suggests that the relationship between emphasis and proficiency depends on whether the preceding consonant is a stop or fricative.}, } @article {pmid36816122, year = {2023}, author = {Ali, IE and Sumita, Y and Wakabayashi, N}, title = {Comparison of Praat and Computerized Speech Lab for formant analysis of five Japanese vowels in maxillectomy patients.}, journal = {Frontiers in neuroscience}, volume = {17}, number = {}, pages = {1098197}, pmid = {36816122}, issn = {1662-4548}, abstract = {INTRODUCTION: Speech impairment is a common complication after surgical resection of maxillary tumors. Maxillofacial prosthodontists play a critical role in restoring this function so that affected patients can enjoy better lives. For that purpose, several acoustic software packages have been used for speech evaluation, among which Computerized Speech Lab (CSL) and Praat are widely used in clinical and research contexts. Although CSL is a commercial product, Praat is freely available on the internet and can be used by patients and clinicians to practice several therapy goals. Therefore, this study aimed to determine whether both software packages produced comparable results for the first two formant frequencies (F1 and F2) and their respective formant ranges obtained from the same voice samples from Japanese participants with maxillectomy defects.

METHODS: CSL was used as a reference to evaluate the accuracy of Praat with both the default and newly proposed adjusted settings. Thirty-seven participants were enrolled in this study for formant analysis of the five Japanese vowels (a/i/u/e/o) using CSL and Praat. Spearman's rank correlation coefficient was used to judge the correlation between the analysis results of both programs regarding F1 and F2 and their respective formant ranges.
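The agreement test described above reduces to a rank correlation between paired measurements from the two programs. A tiny sketch with stand-in F1 values:

```python
import numpy as np
from scipy.stats import spearmanr

# Stand-in F1 values (Hz) for the same vowel tokens as measured by each program;
# these numbers are invented for illustration, not taken from the study.
f1_csl = np.array([742.0, 310.0, 365.0, 450.0, 480.0, 615.0])
f1_praat = np.array([735.0, 305.0, 372.0, 455.0, 470.0, 602.0])

rho, p = spearmanr(f1_csl, f1_praat)
print(f"Spearman rho = {rho:.2f}, p = {p:.4f}")
```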

RESULTS: Highly positive correlations between the two programs were found for all acoustic features and all Praat settings.

DISCUSSION: The strong correlations between the results of both CSL and Praat suggest that both programs may have similar decision strategies for atypical speech and for both sexes. This study highlights that the default settings in Praat can be used for formant analysis in maxillectomy patients with predictable accuracy. The proposed adjusted settings in Praat can yield more accurate results for formant analysis of atypical speech in maxillectomy cases when the examiner cannot precisely locate the formant frequencies using the default settings or confirm analysis results obtained using CSL.}, } @article {pmid36748155, year = {2023}, author = {Zhang, C and Hou, Q and Guo, TT and Zhong, JT and Ren, H and Li, GL}, title = {[The effect of Wendler Glottoplasty to elevate vocal pitch in transgender women].}, journal = {Zhonghua er bi yan hou tou jing wai ke za zhi = Chinese journal of otorhinolaryngology head and neck surgery}, volume = {58}, number = {2}, pages = {139-144}, doi = {10.3760/cma.j.cn115330-20220518-00275}, pmid = {36748155}, issn = {1673-0860}, support = {81900926//National Natural Science Foundation of China/ ; 7204246//Beijing Natural Science Foundation/ ; }, mesh = {Humans ; Male ; Female ; Young Adult ; Adult ; Middle Aged ; *Transgender Persons ; Retrospective Studies ; Speech Acoustics ; Voice Quality ; Phonation ; }, abstract = {Objective: To evaluate the effect of Wendler Glottoplasty on elevating vocal pitch in transgender women. Methods: The pre-surgery and 3-month post-surgery voice parameters of 29 transgender women who underwent Wendler Glottoplasty in the department of otorhinolaryngology head and neck surgery of Beijing Friendship Hospital from January 2017 to October 2020 were retrospectively analyzed. The 29 transgender women ranged in age from 19 to 47 (27.0±6.3) years. Subjective evaluation was performed using the Transsexual Voice Questionnaire for Male-to-Female (TVQ[MtF]). Objective parameters included fundamental frequency (F0), highest pitch, lowest pitch, habitual volume, Jitter, Shimmer, maximal phonation time (MPT), noise-to-harmonic ratio (NHR) and formant frequencies (F1, F2, F3, F4). SPSS 25.0 software was used for statistical analysis. Results: Three months after surgery, the score of TVQ[MtF] was significantly decreased [(89.9±14.7) vs. (50.4±13.6), t=11.49, P<0.001]. The F0 was significantly elevated [(152.7±23.3) Hz vs. (207.7±45.9) Hz, t=-6.03, P<0.001]. Frequencies of F1, F2 and F3 were significantly elevated. No statistical difference was observed in the frequencies of F4. The highest pitch was not significantly altered while the lowest pitch was significantly elevated [(96.8±17.7) Hz vs. (120.0±28.9) Hz, t=-3.71, P=0.001]. Habitual speech volume was significantly increased [(60.0±5.2) dB vs. (63.6±9.6) dB, t=-2.12, P=0.043]. Jitter, Shimmer, NHR and MPT were not significantly altered (P>0.05). Conclusions: Wendler Glottoplasty could notably elevate the vocal pitch, formant frequencies and degree of vocal femininity in transgender women without affecting phonation ability and voice quality.
It can be an effective treatment modality for voice feminization.}, } @article {pmid36742666, year = {2022}, author = {Gunjawate, DR and Ravi, R and Tauro, JP and Philip, R}, title = {Spectral and Temporal Characteristics of Vowels in Konkani.}, journal = {Indian journal of otolaryngology and head and neck surgery : official publication of the Association of Otolaryngologists of India}, volume = {74}, number = {Suppl 3}, pages = {4870-4879}, pmid = {36742666}, issn = {2231-3796}, abstract = {The present study was undertaken to study the acoustic characteristics of vowels using spectrographic analysis in the Mangalorean Catholic Konkani dialect of Konkani spoken in Mangalore, Karnataka, India. Recordings were made of CVC words produced by 11 males and 19 females aged 18-55 years. The CVC words consisted of combinations of vowels such as (/i, i:, e, ɵ, ə, u, o, ɐ, ӓ, ɔ/) and consonants such as (/m, k, w, s, ʅ, h, l, r, p, ʤ, g, n, Ɵ, ṭ, ḷ, b, dh/). Recordings were made in a sound-treated room using PRAAT software, and spectrographic analysis was performed to measure spectral and temporal characteristics such as fundamental frequency (F0), formants (F1, F2, F3), and vowel duration. The results showed that higher fundamental frequency values were observed for short, high and back vowels. Higher F1 values were noted for open vowels and F2 was higher for front vowels. Long vowels had longer duration compared to short vowels, and females had longer vowel duration compared to males. The acoustic information in terms of spectral and temporal cues helps in better understanding the production and perception of languages and dialects.}, } @article {pmid36742539, year = {2022}, author = {Prakash, P and Boominathan, P and Mahalingam, S}, title = {Acoustic Description of Bhramari Pranayama.}, journal = {Indian journal of otolaryngology and head and neck surgery : official publication of the Association of Otolaryngologists of India}, volume = {74}, number = {Suppl 3}, pages = {4738-4747}, pmid = {36742539}, issn = {2231-3796}, abstract = {UNLABELLED: The study's aim was (1) to describe the acoustic characteristics of Bhramari pranayama, and (2) to compare the acoustic features of the nasal consonant /m/ and the sound of Bhramari pranayama produced by yoga trainers. Cross-sectional study design. Thirty-three adult male yoga trainers performed five repeats of the nasal consonant /m/ and Bhramari pranayama. These samples were recorded into Computerized Speech Lab, Kay Pentax model 4500b, using a microphone (SM48). Formant frequencies (fF1, fF2, fF3, and fF4), formant bandwidths (BF1, BF2, BF3, and BF4), anti-formants, and the alpha and beta ratios were analyzed. The nasal consonant /m/ had a higher fF2 and anti-formant compared to Bhramari pranayama. Statistically significant differences were noted in fF2, BF3, and anti-formants. Bhramari pranayama revealed a lower alpha ratio and a higher beta ratio than /m/. However, these differences were not statistically significant. Findings are discussed from acoustic and physiological perspectives. Bhramari pranayama was assumed to be produced with a larger pharyngeal cavity and narrower velar passage when compared to the nasal consonant /m/. Verification at the level of the glottis and with aerodynamic parameters may ascertain the above propositions.

SUPPLEMENTARY INFORMATION: The online version contains supplementary material available at 10.1007/s12070-021-03054-1.}, } @article {pmid36732236, year = {2023}, author = {Kondaurova, MV and Zheng, Q and Donaldson, CW and Smith, AF}, title = {Effect of telepractice on pediatric cochlear implant users and provider vowel space: A preliminary report.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {1}, pages = {467}, doi = {10.1121/10.0016866}, pmid = {36732236}, issn = {1520-8524}, mesh = {Child ; Humans ; *Cochlear Implants ; Speech Acoustics ; Speech Production Measurement ; *Cochlear Implantation ; *Deafness/rehabilitation ; Phonetics ; *Speech Perception ; }, abstract = {Clear speaking styles are goal-oriented modifications in which talkers adapt acoustic-phonetic characteristics of speech to compensate for communication challenges. Do children with hearing loss and a clinical provider modify speech characteristics during telepractice to adjust for remote communication? The study examined the effect of telepractice (tele-) on vowel production in seven (mean age 4:11 years, SD 1:2 years) children with cochlear implants (CIs) and a provider. The first (F1) and second (F2) formant frequencies of /i/, /ɑ/, and /u/ vowels were measured in child and provider speech during one in-person and one tele-speech-language intervention, order counterbalanced. Child and provider vowel space areas (VSA) were calculated. The results demonstrated an increase in F2 formant frequency for /i/ vowel in child and provider speech and an increase in F1 formant frequency for /ɑ/ vowel in the provider speech during tele- compared to in-person intervention. An expansion of VSA was found in child and provider speech in tele- compared to in-person intervention. In children, the earlier age of CI activation was associated with larger VSA in both tele- and in-person intervention. The results suggest that the children and the provider adjust vowel articulation in response to remote communication during telepractice.}, } @article {pmid36719795, year = {2022}, author = {Kirby, J and Pittayaporn, P and Brunelle, M}, title = {Transphonologization of onset voicing: revisiting Northern and Eastern Kmhmu'.}, journal = {Phonetica}, volume = {79}, number = {6}, pages = {591-629}, pmid = {36719795}, issn = {1423-0321}, mesh = {Humans ; *Voice ; Phonation ; Language ; Speech Acoustics ; Acoustics ; Phonetics ; }, abstract = {Phonation and vowel quality are often thought to play a vital role at the initial stage of tonogenesis. This paper investigates the production of voicing and tones in a tonal Northern Kmhmu' dialect spoken in Nan Province, Thailand, and a non-tonal Eastern Kmhmu' dialect spoken in Vientiane, Laos, from both acoustic and electroglottographic perspectives. Large and consistent VOT differences between voiced and voiceless stops are preserved in Eastern Kmhmu', but are not found in Northern Kmhmu', consistent with previous reports. With respect to pitch, f0 is clearly a secondary property of the voicing contrast in Eastern Kmhmu', but unquestionably the primary contrastive property in Northern Kmhmu'. Crucially, no evidence is found to suggest that either phonation type or formant differences act as significant cues to voicing in Eastern Kmhmu' or tones in Northern Kmhmu'. 
These results suggest that voicing contrasts can also be transphonologized directly into f0-based contrasts, skipping a registral stage based primarily on phonation and/or vowel quality.}, } @article {pmid36714887, year = {2023}, author = {Viegas, F and Camargo, Z and Viegas, D and Guimarães, GS and Luiz, RR and Ritto, F and Simões-Zenari, M and Nemr, K}, title = {Acoustic Measurements of Speech and Voice in Men with Angle Class II, Division 1, Malocclusion.}, journal = {International archives of otorhinolaryngology}, volume = {27}, number = {1}, pages = {e10-e15}, pmid = {36714887}, issn = {1809-9777}, abstract = {Introduction The acoustic analysis of speech (measurements of the fundamental frequency and formant frequencies) of different vowels produced by speakers with the Angle class II, division 1, malocclusion can provide information about the relationship between articulatory and phonatory mechanisms in this type of maxillomandibular disproportion. Objectives To investigate acoustic measurements related to the fundamental frequency (F0) and formant frequencies (F1 and F2) of the oral vowels of Brazilian Portuguese (BP) produced by male speakers with Angle class II, division 1, malocclusion (study group) and compare with men with Angle class I malocclusion (control group). Methods In total, 60 men (20 with class II, 40 with class I) aged between 18 and 40 years were included in the study. Measurements of F0, F1 and F2 of the seven oral vowels of BP were estimated from the audio samples containing repetitions of carrier sentences. The statistical analysis was performed using the Student t-test and the effect size was calculated. Results Significant differences (p-values) were detected for F0 values in five vowels ([e], [i], [ᴐ], [o] and [u]), and for F1 in vowels [a] and [ᴐ], with higher values for class II, division 1. Conclusion Statistical differences were found in the F0 measurements with higher values in five of the seven vowels analysed in subjects with Angle class II, division 1. The formant frequencies showed differences only in F1 in two vowels with higher values in the study group. The data suggest that data on voice and speech production must be included in assessment protocols for patients with malocclusion.}, } @article {pmid36712820, year = {2023}, author = {Freeman, V}, title = {Production and perception of prevelar merger: Two-dimensional comparisons using Pillai scores and confusion matrices.}, journal = {Journal of phonetics}, volume = {97}, number = {}, pages = {}, pmid = {36712820}, issn = {0095-4470}, support = {R01 DC006014/DC/NIDCD NIH HHS/United States ; }, abstract = {Vowel merger production is quantified with gradient acoustic measures, while phonemic perception methods are often coarser, complicating comparisons within mergers in progress. This study implements a perception experiment in two-dimensional formant space (F1 × F2), allowing unified plotting, quantification, and statistics with production data. Production and perception are compared within 20 speakers for a two-part prevelar merger in progress in Pacific Northwest English, where mid-front /ɛ, e/ approximate or merge before voiced velar /ɡ/ (leg-vague merger), and low-front prevelar /æɡ/ raises toward them (bag-raising). Distributions are visualized with kernel density plots and overlap quantified with Pillai scores and confusion matrices from linear discriminant analysis models.
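Freeman's entry quantifies category overlap with Pillai scores and confusion matrices from linear discriminant analysis. The following minimal Python sketch computes both on simulated F1 × F2 tokens; the vowel means, dispersions, and token counts are invented, and the study's actual corpus and preprocessing are not reproduced here.

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix

def pillai_score(X, labels):
    """Pillai's trace for a one-way MANOVA on 2-D formant data (F1, F2)."""
    grand = X.mean(axis=0)
    B = np.zeros((X.shape[1], X.shape[1]))   # between-class scatter
    W = np.zeros_like(B)                     # within-class scatter
    for g in np.unique(labels):
        Xg = X[labels == g]
        d = (Xg.mean(axis=0) - grand)[:, None]
        B += len(Xg) * d @ d.T
        W += (Xg - Xg.mean(axis=0)).T @ (Xg - Xg.mean(axis=0))
    eig = np.linalg.eigvals(np.linalg.solve(W, B)).real
    return float(np.sum(eig / (1.0 + eig)))

rng = np.random.default_rng(1)
# Hypothetical F1/F2 tokens for two vowel categories approaching merger.
leg = rng.normal([650, 1900], 60, size=(50, 2))
vague = rng.normal([600, 1950], 60, size=(50, 2))
X = np.vstack([leg, vague])
y = np.array(["leg"] * 50 + ["vague"] * 50)

print("Pillai:", round(pillai_score(X, y), 3))   # 0 = full overlap, 1 = full separation
pred = LinearDiscriminantAnalysis().fit(X, y).predict(X)
print(confusion_matrix(y, pred, labels=["leg", "vague"]))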
Results suggest that leg-vague merger is perceived as more complete than it is produced (in both the sample and community), while bag-raising is highly variable in production but rejected in perception. Relationships between production and perception varied by age: raising and merger progressed across two generations in production but not in perception; younger adults then perceived the leg-vague merger without producing it, and varied in both (minimal) raising perception and bag-raising production. Thus, prevelar raising/merger may be progressing among some social groups but reversing in others.}, } @article {pmid36701896, year = {2023}, author = {Holmes, E and Johnsrude, IS}, title = {Intelligibility benefit for familiar voices is not accompanied by better discrimination of fundamental frequency or vocal tract length.}, journal = {Hearing research}, volume = {429}, number = {}, pages = {108704}, doi = {10.1016/j.heares.2023.108704}, pmid = {36701896}, issn = {1878-5891}, support = {MOP 133450//CIHR/Canada ; }, mesh = {Humans ; *Voice ; Speech ; Cognition ; *Speech Perception ; Heart Rate ; }, abstract = {Speech is more intelligible when it is spoken by familiar than unfamiliar people. If this benefit arises because key voice characteristics like perceptual correlates of fundamental frequency or vocal tract length (VTL) are more accurately represented for familiar voices, listeners may be able to discriminate smaller manipulations to such characteristics for familiar than unfamiliar voices. We measured participants' (N = 17) thresholds for discriminating pitch (correlate of fundamental frequency, or glottal pulse rate) and formant spacing (correlate of VTL; 'VTL-timbre') for voices that were familiar (participants' friends) and unfamiliar (other participants' friends). As expected, familiar voices were more intelligible. However, discrimination thresholds were no smaller for the same familiar voices. The size of the intelligibility benefit for a familiar over an unfamiliar voice did not relate to the difference in discrimination thresholds for the same voices. Also, the familiar-voice intelligibility benefit was just as large following perceptible manipulations to pitch and VTL-timbre. These results are more consistent with cognitive accounts of speech perception than traditional accounts that predict better discrimination.}, } @article {pmid36689265, year = {2023}, author = {Ettore, E and Müller, P and Hinze, J and Riemenschneider, M and Benoit, M and Giordana, B and Hurlemann, R and Postin, D and Lecomte, A and Musiol, M and Lindsay, H and Robert, P and König, A}, title = {Digital Phenotyping for Differential Diagnosis of Major Depressive Episode: Narrative Review.}, journal = {JMIR mental health}, volume = {10}, number = {}, pages = {e37225}, pmid = {36689265}, issn = {2368-7959}, abstract = {BACKGROUND: Major depressive episode (MDE) is a common clinical syndrome. It can be found in different pathologies such as major depressive disorder (MDD), bipolar disorder (BD), posttraumatic stress disorder (PTSD), or even occur in the context of psychological trauma. However, only 1 syndrome is described in international classifications (Diagnostic and Statistical Manual of Mental Disorders, Fifth Edition [DSM-5]/International Classification of Diseases 11th Revision [ICD-11]), which do not take into account the underlying pathology at the origin of the MDE. Clinical interviews are currently the best source of information to obtain the etiological diagnosis of MDE.
Nevertheless, they do not allow early diagnosis, and they yield no objective measures of the clinical information extracted. To remedy this, digital tools correlated with clinical symptomatology could be useful.

OBJECTIVE: We aimed to review the current application of digital tools for MDE diagnosis while highlighting shortcomings for further research. In addition, our work focused on digital devices that are easy to use during clinical interviews and on mental health conditions in which depression is common.

METHODS: We conducted a narrative review of the use of digital tools during clinical interviews for MDE by searching papers published in PubMed/MEDLINE, Web of Science, and Google Scholar databases since February 2010. The search was conducted from June to September 2021. Potentially relevant papers were then compared against a checklist for relevance and reviewed independently for inclusion, with focus on 4 allocated topics: (1) automated voice analysis, (2) behavior analysis by video, and the physiological measures of (3) heart rate variability (HRV) and (4) electrodermal activity (EDA). For this purpose, we were interested in 4 frequently found clinical conditions in which MDE can occur: (1) MDD, (2) BD, (3) PTSD, and (4) psychological trauma.

RESULTS: A total of 74 relevant papers on the subject were qualitatively analyzed and the information was synthesized. Thus, a digital phenotype of MDE seems to emerge, consisting of modifications in speech features (namely, temporal, prosodic, spectral, source, and formant features) and in speech content, modifications in nonverbal behavior (head, hand, body and eyes movement, facial expressivity, and gaze), and a decrease in physiological measurements (HRV and EDA). We found not only similarities but also differences when MDE occurs in MDD, BD, PTSD, or psychological trauma. However, comparative studies were rare in BD or PTSD conditions, which does not allow us to identify clear and distinct digital phenotypes.
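The digital phenotype summarized above spans temporal, prosodic, spectral, source, and formant speech features. As a hedged sketch of how a few such features might be extracted from a recorded interview segment, assuming the librosa library and a hypothetical file name (this is not the pipeline of any reviewed study):

import numpy as np
import librosa

def speech_features(path):
    """A minimal prosodic/spectral feature vector of the kind reviewed above."""
    y, sr = librosa.load(path, sr=16000)
    f0, voiced, _ = librosa.pyin(y, fmin=60, fmax=400, sr=sr)
    f0 = f0[~np.isnan(f0)]
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    rms = librosa.feature.rms(y=y)[0]
    return {
        "f0_mean": float(f0.mean()) if f0.size else 0.0,   # prosodic
        "f0_sd": float(f0.std()) if f0.size else 0.0,
        "voiced_ratio": float(np.mean(voiced)),            # temporal
        "rms_sd": float(rms.std()),                        # intensity variability
        "mfcc_means": mfcc.mean(axis=1).tolist(),          # spectral envelope
    }

print(speech_features("interview_segment.wav"))            # hypothetical file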

CONCLUSIONS: Our search identified markers from several modalities that hold promise for helping with a more objective diagnosis of MDE. To validate their potential, further longitudinal and prospective studies are needed.}, } @article {pmid36680472, year = {2023}, author = {Aoyama, K and Hong, L and Flege, JE and Akahane-Yamada, R and Yamada, T}, title = {Relationships Between Acoustic Characteristics and Intelligibility Scores: A Reanalysis of Japanese Speakers' Productions of American English Liquids.}, journal = {Language and speech}, volume = {66}, number = {4}, pages = {1030-1045}, doi = {10.1177/00238309221140910}, pmid = {36680472}, issn = {1756-6053}, mesh = {Adult ; Child ; Humans ; United States ; Japan ; *Speech Acoustics ; *Language ; Speech ; Acoustics ; Speech Intelligibility ; Phonetics ; }, abstract = {The primary purpose of this research report was to investigate the relationships between acoustic characteristics and perceived intelligibility for native Japanese speakers' productions of American English liquids. This report was based on a reanalysis of intelligibility scores and acoustic analyses that were reported in two previous studies. We examined which acoustic parameters were associated with higher perceived intelligibility scores for their productions of /l/ and /ɹ/ in American English, and whether Japanese speakers' productions of the two liquids were acoustically differentiated from each other. Results demonstrated that the second formant (F2) was strongly correlated with the perceived intelligibility scores for the Japanese adults' productions. Results also demonstrated that the Japanese adults' and children's productions of /l/ and /ɹ/ were indeed differentiated by some acoustic parameters including the third formant (F3). In addition, some changes occurred in the Japanese children's productions over the course of 1 year. Overall, the present report shows that Japanese speakers of American English may be making a distinction between /l/ and /ɹ/ in production, although the distinctions are made in a different way compared with native English speakers' productions. These findings have implications for setting realistic goals for improving intelligibility of English /l/ and /ɹ/ for Japanese speakers, as well as theoretical advancement of second-language speech learning.}, } @article {pmid36608104, year = {2023}, author = {Sahin, S and Sen Yilmaz, B}, title = {Effects of the Orthognathic Surgery on the Voice Characteristics of Skeletal Class III Patients.}, journal = {The Journal of craniofacial surgery}, volume = {34}, number = {1}, pages = {253-257}, doi = {10.1097/SCS.0000000000008843}, pmid = {36608104}, issn = {1536-3732}, mesh = {Adult ; Humans ; Male ; Female ; Voice Quality ; Speech Acoustics ; *Orthognathic Surgery ; *Voice ; Acoustics ; }, abstract = {OBJECTIVES: To analyze the effects of the bimaxillary orthognathic surgery on the voice characteristics of skeletal Class III cases, and to evaluate correlations between acoustic and skeletal changes.

METHOD: Skeletal Class III adult patients (7 male, 18 female) were asked to pronounce the sounds "[a], [ɛ], [ɯ], [i], [ɔ], [œ], [u], [y]" for 3 seconds. Voice recordings and lateral cephalometric x-rays were taken before surgery (T0) and 6 months after (T1). Voice recordings were taken for the control group at a 6-month interval (n=20). The fundamental frequency (F0), the formant frequencies (F1, F2, and F3), and the Shimmer, Jitter, and Noise-to-Harmonic Ratio (NHR) parameters were measured with Praat version 6.0.43.

RESULTS: In the surgery group, significant differences were observed in F1 of [ɛ], F2 and Shimmer of [ɯ], F1 and F2 of [œ], and F1 of [y]; the post-surgery values were lower, whereas F3 of the [u] sound was higher. In comparison with the control group, ΔF3 of the [ɔ] and [u] sounds and ΔF1 of the [y] sound, ΔShimmer of [ɛ], [ɯ], [i], [ɔ], [u] and [y], and ΔNHR of the [ɔ] sound changed significantly. Pearson correlation analysis revealed some correlations: between ΔF2 and ΔSNA for the [ɯ] and [œ] sounds, and between ΔF1 and ΔHBV for the [y] sound.

CONCLUSION: Bimaxillary orthognathic surgery changed some voice parameters in skeletal Class III patients. Some correlations were found between skeletal and acoustic parameters. We advise clinicians to consider these findings and inform their patients.}, } @article {pmid36593767, year = {2023}, author = {Kim, S and Choi, J and Cho, T}, title = {Data on English coda voicing contrast under different prosodic conditions produced by American English speakers and Korean learners of English.}, journal = {Data in brief}, volume = {46}, number = {}, pages = {108816}, pmid = {36593767}, issn = {2352-3409}, abstract = {This data article provides acoustic data for individual speakers' production of coda voicing contrast between stops in English, which are based on laboratory speech recorded by twelve native speakers of American English and twenty-four Korean learners of English. There were four pairs of English monosyllabic target words with voicing contrast in the coda position (bet-bed, pet-ped, bat-bad, pat-pad). The words were produced in carrier sentences in which they were placed in two different prosodic boundary conditions (Intonational Phrase initial and Intonational Phrase medial), two pitch accent conditions (nuclear-pitch accented and unaccented), and three focus conditions (lexical focus, phonological focus and no focus). The raw acoustic measurement values that are included in a CSV-formatted file are F0, F1, F2 and duration of each vowel preceding a coda consonant, and Voice Onset Time of word-initial stops. This article also provides figures that exemplify individual speaker variation of vowel duration, F0, F1 and F2 as a function of focus conditions. The data can thus be potentially reused to observe individual variations in phonetic encoding of coda voicing contrast as a function of the aforementioned prosodically conditioned factors (i.e., prosodic boundary, pitch accent, focus) in native vs. non-native English. Some theoretical aspects of the data are discussed in the full-length article entitled "Phonetic encoding of coda voicing contrast under different focus conditions in L1 vs. L2 English" [1].}, } @article {pmid36586864, year = {2022}, author = {Herbst, CT and Story, BH}, title = {Computer simulation of vocal tract resonance tuning strategies with respect to fundamental frequency and voice source spectral slope in singing.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {6}, pages = {3548}, doi = {10.1121/10.0014421}, pmid = {36586864}, issn = {1520-8524}, mesh = {Male ; Female ; Humans ; *Singing ; Computer Simulation ; *Voice ; Sound ; Vibration ; }, abstract = {A well-known concept of singing voice pedagogy is "formant tuning," where the lowest two vocal tract resonances (fR1, fR2) are systematically tuned to harmonics of the laryngeal voice source to maximize the level of radiated sound. A comprehensive evaluation of this resonance tuning concept is still needed. Here, the effect of fR1, fR2 variation was systematically evaluated in silico across the entire fundamental frequency range of classical singing for three voice source characteristics with spectral slopes of -6, -12, and -18 dB/octave. Respective vocal tract transfer functions were generated with a previously introduced low-dimensional computational model, and resultant radiated sound levels were expressed in dB(A). Two distinct strategies for optimized sound output emerged for low vs high voices.
At low pitches, spectral slope was the predominant factor for sound level increase, and resonance tuning only had a marginal effect. In contrast, resonance tuning strategies became more prevalent and voice source strength played an increasingly marginal role as fundamental frequency increased to the upper limits of the soprano range. This suggests that different voice classes (e.g., low male vs high female) likely have fundamentally different strategies for optimizing sound output, which has important implications for pedagogical practice.}, } @article {pmid36578688, year = {2022}, author = {Ji, Y and Hu, Y and Jiang, X}, title = {Segmental and suprasegmental encoding of speaker confidence in Wuxi dialect vowels.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {1028106}, pmid = {36578688}, issn = {1664-1078}, abstract = {INTRODUCTION: Wuxi dialect is a variety of Wu dialect spoken in eastern China and is characterized by a rich tonal system. Compared with standard Mandarin speakers, native speakers of the Wuxi dialect can be more efficient in varying vocal cues to encode communicative meanings in speech communication. While the literature has demonstrated that speakers encode high vs. low confidence in global prosodic cues at the sentence level, it is unknown how speakers' intended confidence is encoded at a more local, phonetic level. This study aimed to explore the effects of speakers' intended confidence on both prosodic and formant features of vowels in two lexical tones (the flat tone and the contour tone) of Wuxi dialect.

METHODS: Words consisting of a single vowel were spoken in a confident, unconfident, or neutral tone of voice by native Wuxi dialect speakers using a standard elicitation procedure. Linear mixed-effects modeling and parametric bootstrap testing were performed.
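A minimal sketch of the kind of linear mixed-effects model named in this METHODS section, using Python's statsmodels with a hypothetical long-format table (columns speaker, confidence, tone, F1); the formula is illustrative, not the authors' exact specification.

import pandas as pd
import statsmodels.formula.api as smf

# Hypothetical long-format table: one row per vowel token, with the
# speaker-intended confidence level, lexical tone, and measured F1.
df = pd.read_csv("wuxi_vowels.csv")   # columns: speaker, confidence, tone, F1

# Random intercept per speaker; fixed effects of confidence and tone.
model = smf.mixedlm("F1 ~ C(confidence) * C(tone)", df, groups=df["speaker"])
fit = model.fit()
print(fit.summary())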

RESULTS: The results showed that (1) the speakers raised both F1 and F2 at the confident level (compared with the neutral-intending expression), and F1 additionally distinguished the confident from the unconfident expressions; (2) compared with the neutral-intending expression, the speakers raised mean f0, showed greater f0 variation, and prolonged pronunciation time at the unconfident level, while they raised mean intensity, showed greater intensity variation, and prolonged pronunciation time at the confident level; and (3) the speakers modulated mean f0 and mean intensity to a larger extent on the flat tone than on the contour tone to differentiate levels of confidence, while they modulated f0 and intensity range more only on the contour tone.

DISCUSSION: These findings shed new light on the mechanisms of segmental and suprasegmental encoding of speaker confidence and lack of confidence at the vowel level, highlighting the interplay of lexical tone and vocal expression in speech communication.}, } @article {pmid36571115, year = {2023}, author = {Grawunder, S and Uomini, N and Samuni, L and Bortolato, T and Girard-Buttoz, C and Wittig, RM and Crockford, C}, title = {Expression of concern: 'Chimpanzee vowel-like sounds and voice quality suggest formant space expansion through the hominoid lineage' (2022) by Grawunder et al.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {378}, number = {1870}, pages = {20220476}, doi = {10.1098/rstb.2022.0476}, pmid = {36571115}, issn = {1471-2970}, } @article {pmid38875684, year = {2022}, author = {Iyer, R and Meyer, D}, title = {Detection of Suicide Risk Using Vocal Characteristics: Systematic Review.}, journal = {JMIR biomedical engineering}, volume = {7}, number = {2}, pages = {e42386}, pmid = {38875684}, issn = {2561-3278}, abstract = {BACKGROUND: In an age when telehealth services are increasingly being used for forward triage, there is a need for accurate suicide risk detection. Vocal characteristics analyzed using artificial intelligence are now proving capable of detecting suicide risk with accuracies superior to traditional survey-based approaches, suggesting an efficient and economical approach to ensuring ongoing patient safety.

OBJECTIVE: This systematic review aimed to identify which vocal characteristics perform best at differentiating patients at elevated risk of suicide from other cohorts, and to identify the methodological specifications of the systems used to derive each feature and the resulting classification accuracies.

METHODS: A search of MEDLINE via Ovid, Scopus, Computers and Applied Science Complete, CADTH, Web of Science, ProQuest Dissertations and Theses A&I, Australian Policy Online, and Mednar was conducted between 1995 and 2020 and updated in 2021. The inclusion criteria were human participants with no language, age, or setting restrictions applied; randomized controlled studies, observational cohort studies, and theses; studies that used some measure of vocal quality; and individuals assessed as being at high risk of suicide compared with other individuals at lower risk using a validated measure of suicide risk. Risk of bias was assessed using the Risk of Bias in Non-randomized Studies tool. A random-effects model meta-analysis was used wherever mean measures of vocal quality were reported.

RESULTS: The search yielded 1074 unique citations, of which 30 (2.79%) were screened via full text. A total of 21 studies involving 1734 participants met all inclusion criteria. Most studies (15/21, 71%) sourced participants via either the Vanderbilt II database of recordings (8/21, 38%) or the Silverman and Silverman perceptual study recording database (7/21, 33%). Candidate vocal characteristics that performed best at differentiating between high risk of suicide and comparison cohorts included timing patterns of speech (median accuracy 95%), power spectral density sub-bands (median accuracy 90.3%), and mel-frequency cepstral coefficients (median accuracy 80%). A random-effects meta-analysis was used to compare 22 characteristics nested within 14% (3/21) of the studies, which demonstrated significant standardized mean differences for frequencies within the first and second formants (standardized mean difference ranged between -1.07 and -2.56) and jitter values (standardized mean difference=1.47). In 43% (9/21) of the studies, risk of bias was assessed as moderate, whereas in the remaining studies (12/21, 57%), the risk of bias was assessed as high.
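The pooled standardized mean differences above come from a random-effects meta-analysis. A compact sketch of the commonly used DerSimonian-Laird estimator follows; the per-study effects and variances are invented for illustration, chosen to fall within the reported -1.07 to -2.56 range.

import numpy as np

def dersimonian_laird(effects, variances):
    """Random-effects pooled estimate of standardized mean differences."""
    effects, variances = np.asarray(effects), np.asarray(variances)
    w = 1.0 / variances                           # fixed-effect weights
    fixed = np.sum(w * effects) / np.sum(w)
    Q = np.sum(w * (effects - fixed) ** 2)        # heterogeneity statistic
    df = len(effects) - 1
    c = np.sum(w) - np.sum(w ** 2) / np.sum(w)
    tau2 = max(0.0, (Q - df) / c)                 # between-study variance
    w_re = 1.0 / (variances + tau2)
    pooled = np.sum(w_re * effects) / np.sum(w_re)
    se = np.sqrt(1.0 / np.sum(w_re))
    return pooled, pooled - 1.96 * se, pooled + 1.96 * se

# Hypothetical per-study SMDs and variances for a formant-frequency measure.
print(dersimonian_laird([-1.07, -1.80, -2.56], [0.12, 0.20, 0.25]))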

CONCLUSIONS: Although several key methodological issues prevailed among the studies reviewed, there is promise in the use of vocal characteristics to detect elevations in suicide risk, particularly in novel settings such as telehealth or conversational agents.

TRIAL REGISTRATION: PROSPERO International Prospective Register of Systematic Reviews CRD420200167413; https://www.crd.york.ac.uk/prospero/display_record.php?ID=CRD42020167413.}, } @article {pmid36508721, year = {2023}, author = {Moya-Galé, G and Wisler, AA and Walsh, SJ and McAuliffe, MJ and Levy, ES}, title = {Acoustic Predictors of Ease of Understanding in Spanish Speakers With Dysarthria Associated With Parkinson's Disease.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {8S}, pages = {2999-3012}, doi = {10.1044/2022_JSLHR-22-00284}, pmid = {36508721}, issn = {1558-9102}, mesh = {Humans ; *Dysarthria/complications ; Speech Intelligibility ; Speech Acoustics ; *Parkinson Disease/complications ; Acoustics ; Speech Production Measurement ; }, abstract = {PURPOSE: The purpose of this study was to examine selected baseline acoustic features of hypokinetic dysarthria in Spanish speakers with Parkinson's disease (PD) and identify potential acoustic predictors of ease of understanding in Spanish.

METHOD: Seventeen Spanish-speaking individuals with mild-to-moderate hypokinetic dysarthria secondary to PD and eight healthy controls were recorded reading a translation of the Rainbow Passage. Acoustic measures of vowel space area, as indicated by the formant centralization ratio (FCR), envelope modulation spectra (EMS), and articulation rate, were derived from the speech samples. Additionally, 15 healthy adults rated ease of understanding of the recordings on a visual analogue scale. A multiple linear regression model was implemented to investigate the predictive value of the selected acoustic parameters on ease of understanding.
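The formant centralization ratio (FCR) used in this study is conventionally computed from corner-vowel formant means. Below is a minimal sketch under the assumption that the usual definition (Sapir and colleagues' ratio) applies; the example values are hypothetical.

def fcr(f1, f2):
    """Formant centralization ratio from per-vowel mean F1/F2 (Hz).
    f1, f2: dicts keyed by corner vowel. Higher FCR = more centralized vowels.
    The formula follows the common definition, an assumption here:
    (F2u + F2a + F1i + F1u) / (F2i + F1a)."""
    return (f2["u"] + f2["a"] + f1["i"] + f1["u"]) / (f2["i"] + f1["a"])

# Hypothetical corner-vowel means for one speaker.
f1 = {"i": 320.0, "a": 750.0, "u": 350.0}
f2 = {"i": 2300.0, "a": 1300.0, "u": 900.0}
print(round(fcr(f1, f2), 3))   # values nearer 1 indicate centralization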

RESULTS: Listeners' ease of understanding was significantly lower for speakers with dysarthria than for healthy controls. The FCR, EMS from the first 10 s of the reading passage, and the difference in EMS between the end and the beginning sections of the passage differed significantly between the two groups of speakers. Findings indicated that 67.7% of the variability in ease of understanding was explained by the predictive model, suggesting a moderately strong relationship between the acoustic and perceptual domains.

CONCLUSIONS: Measures of envelope modulation spectra were found to be highly significant model predictors of ease of understanding of Spanish-speaking individuals with hypokinetic dysarthria associated with PD. Articulation rate was also found to be important (albeit to a lesser degree) in the predictive model. The formant centralization ratio should be further examined with a larger sample size and more severe dysarthria to determine its efficacy in predicting ease of understanding.}, } @article {pmid36477984, year = {2023}, author = {Peng, H and Li, S and Xing, J and Yang, F and Wu, A}, title = {Surface plasmon resonance of Au/Ag metals for the photoluminescence enhancement of lanthanide ion Ln[3+] doped upconversion nanoparticles in bioimaging.}, journal = {Journal of materials chemistry. B}, volume = {11}, number = {24}, pages = {5238-5250}, doi = {10.1039/d2tb02251f}, pmid = {36477984}, issn = {2050-7518}, mesh = {*Lanthanoid Series Elements/chemistry ; Surface Plasmon Resonance ; *Nanoparticles/chemistry ; *Quantum Dots/chemistry ; }, abstract = {Deep tissue penetration, chemical inertness and biocompatibility give UCNPs a competitive edge over traditional fluorescent materials like organic dyes or quantum dots. However, the low quantum efficiency of UCNPs becomes an obstacle. Among the many methods and strategies currently used to address this issue, surface plasmon resonance (SPR) of noble metals is of great use due to the agreement between the SPR peak of metals and the absorption band of UCNPs. A key challenge of this match is that the structures and sizes of noble metals have significant influences on the peaks of SPR formants, so an explicit elucidation of the relationships between the physical properties of noble metals and their SPR formants is of great importance. This review aims to clarify the mechanism of the SPR effect of noble metals on the optical performance of UCNPs. Furthermore, novel research studies in which Au, Ag or Au/Ag composites in various structures and sizes are combined with UCNPs through different synthetic methods are summarized. We provide an overview of improved photoluminescence for bioimaging exhibited by different composite nanoparticles with respect to UCNPs acting as both cores and shells, taking Au@UCNPs, Ag@UCNPs and Au/Ag@UCNPs into account. Finally, there are remaining shortcomings and latent opportunities that deserve further research. This review will provide directions for the bioimaging applications of UCNPs through the introduction of the SPR effect of noble metals.}, } @article {pmid36460491, year = {2024}, author = {Wang, Y and Hattori, M and Liu, R and Sumita, YI}, title = {Digital acoustic analysis of the first three formant frequencies in patients with a prosthesis after maxillectomy.}, journal = {The Journal of prosthetic dentistry}, volume = {132}, number = {5}, pages = {1082-1087}, doi = {10.1016/j.prosdent.2022.10.010}, pmid = {36460491}, issn = {1097-6841}, mesh = {Humans ; Male ; Middle Aged ; Aged ; Adult ; Aged, 80 and over ; *Maxilla/surgery ; *Palatal Obturators ; *Speech Acoustics ; Young Adult ; Phonetics ; Speech Intelligibility ; }, abstract = {STATEMENT OF PROBLEM: Prosthetic rehabilitation with an obturator can help to restore or improve the intelligibility of speech in patients after maxillectomy.
The frequencies of formants 1 and 2, as well as their ranges, were initially reported in patients with maxillary defects in 2002, and the evaluation method that was used is now applied in clinical evaluation. However, the details of formant 3 are not known and warrant investigation because, according to speech science, formant 3 is related to the pharyngeal volume. Clarifying the formant frequency values of formant 3 in patients after maxillectomy would enable prosthodontists to refer to these data when planning treatment and when assessing the outcome of an obturator.

PURPOSE: The purpose of this clinical study was to determine the acoustic characteristics of formant 3, together with those of formants 1 and 2, by using a digital acoustic analysis during maxillofacial prosthetic treatment. The utility of determining formant 3 in the evaluation of speech in patients after maxillectomy was also assessed.

MATERIAL AND METHODS: Twenty-six male participants after a maxillectomy (mean age, 63 years; range, 20 to 93 years) were included, and the 5 Japanese vowels /a/, /e/, /i/, /o/, and /u/ produced with and without a definitive obturator prosthesis were recorded. The frequencies of the 3 formants were determined, and their ranges were calculated by using a speech analysis system (Computerized Speech Lab CSL 4400). The Wilcoxon signed rank test was used to compare the formants between the 2 use conditions (α=0.05).

RESULTS: Significant differences were found in the frequencies and ranges of all 3 formants between the use conditions. The ranges of all 3 formants produced with the prosthesis were significantly greater than those produced without it.
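The comparison reported here rests on the Wilcoxon signed-rank test over paired with/without-prosthesis measurements. A minimal sketch with scipy on simulated paired formant ranges (the means and spreads are invented, not the study's data):

import numpy as np
from scipy.stats import wilcoxon

# Hypothetical paired F3 ranges (Hz) across the five vowels for each
# participant, measured with and without the obturator prosthesis.
rng = np.random.default_rng(7)
without = rng.normal(400, 80, size=26)
with_prosthesis = without + rng.normal(120, 50, size=26)  # wider ranges expected

stat, p = wilcoxon(with_prosthesis, without)   # paired, non-parametric
print(f"W = {stat:.1f}, p = {p:.4f}")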

CONCLUSIONS: Based on the findings, both the first 2 formants and the third formant were changed by wearing an obturator prosthesis. Because formant 3 is related to the volume of the pharynx, evaluation of this formant and its range can reflect the effectiveness of the prosthesis in sealing the oronasal communication and helping reduce hypernasality, suggesting the utility of formant 3 analysis in prosthodontic rehabilitation.}, } @article {pmid36456282, year = {2022}, author = {Voeten, CC and Heeringa, W and Van de Velde, H}, title = {Normalization of nonlinearly time-dynamic vowels.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {5}, pages = {2692}, doi = {10.1121/10.0015025}, pmid = {36456282}, issn = {1520-8524}, abstract = {This study compares 16 vowel-normalization methods for purposes of sociophonetic research. Most of the previous work in this domain has focused on the performance of normalization methods on steady-state vowels. By contrast, this study explicitly considers dynamic formant trajectories, using generalized additive models to model these nonlinearly. Normalization methods were compared using a hand-corrected dataset from the Flemish-Dutch Teacher Corpus, which contains 160 speakers from 8 geographical regions, who spoke regionally accented versions of Netherlandic/Flemish Standard Dutch. Normalization performance was assessed by comparing the methods' abilities to remove anatomical variation, retain vowel distinctions, and explain variation in the normalized F0-F3. In addition, it was established whether normalization competes with by-speaker random effects or supplements it, by comparing how much between-speaker variance remained to be apportioned to random effects after normalization. The results partly reproduce the good performance of Lobanov, Gerstman, and Nearey 1 found earlier and generally favor log-mean and centroid methods. However, newer methods achieve higher effect sizes (i.e., explain more variance) at only marginally worse performances. Random effects were found to be equally useful before and after normalization, showing that they complement it. The findings are interpreted in light of the way that the different methods handle formant dynamics.}, } @article {pmid36455242, year = {2023}, author = {Leyns, C and Daelman, J and Adriaansen, A and Tomassen, P and Morsomme, D and T'Sjoen, G and D'haeseleer, E}, title = {Short-Term Acoustic Effects of Speech Therapy in Transgender Women: A Randomized Controlled Trial.}, journal = {American journal of speech-language pathology}, volume = {32}, number = {1}, pages = {145-168}, doi = {10.1044/2022_AJSLP-22-00135}, pmid = {36455242}, issn = {1558-9110}, mesh = {Humans ; Female ; *Speech Therapy ; Speech Acoustics ; *Transgender Persons ; Acoustics ; Speech ; }, abstract = {PURPOSE: This study measured and compared the short-term acoustic effects of pitch elevation training (PET), articulation-resonance training (ART), and the combination of both programs in transgender women.

METHOD: A randomized controlled study with cross-over design was used. Thirty transgender women were included and received 14 weeks of speech training. All participants started with 4 weeks of sham training, after which they were randomly assigned to one of two groups: one group continued with PET (5 weeks), followed by ART (5 weeks); the second group received both trainings in the opposite order. Participants were recorded 4 times, in between the training blocks: pre, post 1 (after sham), post 2 (after training 1), and post 3 (after training 2). Speech samples included a sustained vowel, continuous speech during reading, and spontaneous speech and were analyzed using Praat software. Fundamental frequency (fo), intensity, voice range profile, vowel formant frequencies (F1-F2-F3-F4-F5 of /a/-/i/-/u/), formant contrasts, vowel space, and vocal quality (Acoustic Voice Quality Index) were determined.

RESULTS AND CONCLUSIONS: Fundamental frequencies increased after both the PET and ART programs, with a larger increase after PET. The combination of both interventions showed a mean increase in fo of 49 Hz during a sustained vowel, 49 Hz during reading, and 29 Hz during spontaneous speech. However, the lower limit (5th percentile) of fo during spontaneous speech did not change. Higher values were detected for F1-F2 of /a/, F3 of /u/, and vowel space after PET and ART separately. F1-F2-F3 of /a/, F1-F3-F4 of /u/, vowel space, and formant contrasts increased after the combination of PET and ART; hence, the combination induced larger increases in formant frequencies. Intensity and voice quality measurements did not change. No order effect was detected; that is, starting with PET or ART did not change the outcome.}, } @article {pmid36425833, year = {2022}, author = {Chen, S and Han, C and Wang, S and Liu, X and Wang, B and Wei, R and Lei, X}, title = {Hearing the physical condition: The relationship between sexually dimorphic vocal traits and underlying physiology.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {983688}, pmid = {36425833}, issn = {1664-1078}, abstract = {A growing amount of research has shown associations between sexually dimorphic vocal traits and physiological conditions related to reproductive advantage. This paper presents a review of the literature on the relationship between sexually dimorphic vocal traits and sex hormones, body size, and physique. Those physiological conditions are important in reproductive success and mate selection. Regarding sex hormones, there are associations between sex-specific hormones and sexually dimorphic vocal traits; regarding body size, formant frequencies are more reliable predictors of human body size than pitch/fundamental frequency; and regarding physique, there is a possible but still controversial association between the human voice and strength and combat power, while pitch is more often used as a signal of aggressive intent in conflict. Future research should consider demographic, cross-cultural, cognitive interaction, and emotional motivation influences, in order to more accurately assess the relationship between voice and physiology. Moreover, neurological studies were recommended to gain a deeper understanding of the evolutionary origins and adaptive functions of voice modulation.}, } @article {pmid36397662, year = {2022}, author = {Eichner, ACO and Donadon, C and Skarżyński, PH and Sanfins, MD}, title = {A Systematic Review of the Literature Between 2009 and 2019 to Identify and Evaluate Publications on the Effects of Age-Related Hearing Loss on Speech Processing.}, journal = {Medical science monitor : international medical journal of experimental and clinical research}, volume = {28}, number = {}, pages = {e938089}, pmid = {36397662}, issn = {1643-3750}, mesh = {Aged ; Animals ; Humans ; Speech ; *Speech Perception/physiology ; Acoustic Stimulation ; *Hearing Loss, Sensorineural ; *Cochlear Implants ; }, abstract = {Changes in central auditory processing due to aging in normal-hearing elderly patients, as well as age-related hearing loss, are often associated with difficulties in speech processing, especially in unfavorable acoustic environments.
Speech processing depends on the perception of temporal and spectral features, and for this reason it can be assessed by recording phase-locked neural activity synchronized to transient and periodic sound stimuli, known as frequency-following responses (FFRs). An electronic search of the PubMed and Web of Science databases was carried out in July 2019. Studies that evaluated the effects of age-related hearing loss on components of FFRs were included. Studies that were not in English, studies performed on animals, studies with cochlear implant users, literature reviews, letters to the editor, and case studies were excluded. Our search yielded 6 studies, each of which included 30 to 94 subjects aged between 18 and 80 years. Latency increases and significant amplitude reductions of the onset, offset, and slope V/A components of FFRs were observed. Latency and amplitude impairment of the fundamental frequency, first formant, and higher formants were related to peripheral sensorineural hearing loss in the elderly population. Conclusions: Temporal changes in FFR tracing were related to the aging process. Hearing loss also impacts the envelope fine structure, producing poorer speech comprehension in noisy environments. More research is needed to understand aspects related to hearing loss and the cognitive factors common in the elderly.}, } @article {pmid36376191, year = {2022}, author = {Raveendran, R and Yeshoda, K}, title = {Effects of Resonant Voice Therapy on Perceptual and Acoustic Source and Tract Parameters - A Preliminary Study on Indian Carnatic Classical Singers.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.09.023}, pmid = {36376191}, issn = {1873-4588}, abstract = {PURPOSE: The aim of the study was to examine the effects of resonant voice therapy (RVT) on the vocal resonance of trained Carnatic singers. The specific objectives were to evaluate the effects of RVT on auditory perceptual judgments and on acoustic source and tract parameters of phonation and sung voice samples obtained before and after training.

METHOD: Six vocally healthy trained Carnatic singers, three males and three females aged 18-25 years (M = 23; S.D. = 2.09), participated in the study. All participants were assigned to a 21-day resonant voice therapy (RVT) training program. The participants' pre- and post-training phonation and sung samples were subjected to auditory perceptual analysis and acoustic analysis.

RESULTS: The results revealed that the post-training auditory perceptual ratings of the phonation task showed a statistically significant difference from the pre-training scores (Z = 2.35; P = 0.019). For the singing task, in contrast, the post-training perceptual ratings were not significantly different from the pre-training perceptual rating scores (Z = 2.66; P = 0.08). A significant difference was observed between the pre- and post-training values for all the measured acoustic parameters of the phonation task. In the singing task, though the fundamental frequency and the third and fourth formant frequencies showed no significant difference between the pre- and post-training conditions (P > 0.05), the difference between the first formant frequency and the fundamental frequency showed a significant decrease (P = 0.028).

CONCLUSION: Resonant voice production led to high vocal economy, as evidenced by the improved source and filter acoustic parameters. These results suggest formant tuning through vocal tract modifications, probably an enlarged pharyngeal area, resulting in increased resonant voice quality in both the phonation and singing tasks.}, } @article {pmid36371478, year = {2022}, author = {Rocchesso, D and Andolina, S and Ilardo, G and Palumbo, SD and Galluzzo, Y and Randazzo, M}, title = {A perceptual sound space for auditory displays based on sung-vowel synthesis.}, journal = {Scientific reports}, volume = {12}, number = {1}, pages = {19370}, pmid = {36371478}, issn = {2045-2322}, support = {PON AIM (id: AIM1875400-1, CUP: B74I18000210006)//Ministero dell'Istruzione, dell'Università e della Ricerca/ ; }, mesh = {Humans ; Sound Spectrography ; *Singing ; Sound ; *Speech Perception ; }, abstract = {When designing displays for the human senses, perceptual spaces are of great importance to give intuitive access to physical attributes. Similar to how perceptual spaces based on hue, saturation, and lightness were constructed for visual color, research has explored perceptual spaces for sounds of a given timbral family based on timbre, brightness, and pitch. To promote an embodied approach to the design of auditory displays, we introduce the Vowel-Type-Pitch (VTP) space, a cylindrical sound space based on human sung vowels, whose timbres can be synthesized by the composition of acoustic formants and can be categorically labeled. Vowels are arranged along the circular dimension, while voice type and pitch of the vowel correspond to the remaining two axes of the cylindrical VTP space. The decoupling and perceptual effectiveness of the three dimensions of the VTP space are tested through a vowel labeling experiment, whose results are visualized as maps on circular slices of the VTP cylinder. We discuss implications for the design of auditory and multi-sensory displays that account for human perceptual capabilities.}, } @article {pmid36360418, year = {2022}, author = {Yoon, TJ and Ha, S}, title = {Adults' Perception of Children's Vowel Production.}, journal = {Children (Basel, Switzerland)}, volume = {9}, number = {11}, pages = {}, pmid = {36360418}, issn = {2227-9067}, support = {NRF-2021S1A5A2A03064795//Ministry of Education of the Republic of Korea and the National Research Foundation of Korea/ ; }, abstract = {The study examined the link between Korean-speaking children's vowel production and its perception by inexperienced adults and also observed whether ongoing vowel changes in mid-back vowels affect adults' perceptions when the vowels are produced by children. This study analyzed vowels in monosyllabic words produced by 20 children, ranging from 2 to 6 years old, with a focus on gender distinction, and used them as perceptual stimuli for word perception by 20 inexperienced adult listeners. Acoustic analyses indicated that F0 was not a reliable cue for distinguishing gender, but the first two formants served as reliable cues for gender distinction. The results confirmed that the spacing of the two low formants is linguistically and para-linguistically important in identifying vowel types and gender. However, a pair of non-low back vowels caused difficulties in correct vowel identification.
The proximal distance between these vowels may explain the high rate of mismatch between children's production and adults' perception of the two non-low back vowels in Korean. We attribute this mismatch to the ongoing sound change observed in the high and mid back vowels of adult speech. The ongoing vowel change is also observed in the children's vowel space, which may well be shaped after that of caregivers whose non-low back vowels are close to each other.}, } @article {pmid36359019, year = {2022}, author = {Guo, S and Wu, W and Liu, Y and Kang, X and Li, C}, title = {Effects of Valley Topography on Acoustic Communication in Birds: Why Do Birds Avoid Deep Valleys in Daqinggou Nature Reserve?.}, journal = {Animals : an open access journal from MDPI}, volume = {12}, number = {21}, pages = {}, pmid = {36359019}, issn = {2076-2615}, support = {No. 2022xjkk0802//The Ministry of Science and Technology of China/ ; No. 2019HJ2096001006//The Ministry of Ecology and Environment of China/ ; }, abstract = {To investigate the effects of valley topography on the acoustic transmission of avian vocalisations, we carried out playback experiments in Daqinggou valley, Inner Mongolia, China. During the experiments, we recorded the vocalisations of five avian species, the large-billed crow (Corvus macrorhynchos Wagler, 1827), common cuckoo (Cuculus canorus Linnaeus, 1758), Eurasian magpie (Pica pica Linnaeus, 1758), Eurasian tree sparrow (Passer montanus Linnaeus, 1758), and meadow bunting (Emberiza cioides Brand, 1843), at transmission distances of 30 m and 50 m in the upper and lower parts of the valley and analysed the intensity, the fundamental frequency (F0), and the first three formant frequencies (F1/F2/F3) of the sounds. We also investigated bird species diversity in the upper and lower valley. We found that: (1) at the distance of 30 m, there were significant differences in F0/F1/F2/F3 in Eurasian magpies, significant differences in F1/F2/F3 in the meadow bunting and Eurasian tree sparrow, and partially significant differences in sound frequency between the upper and lower valley in the other two species; (2) at the distance of 50 m, there were significant differences in F0/F1/F2/F3 in two avian species (large-billed crow and common cuckoo) between the upper and lower valley and partially significant differences in sound frequency between the upper and lower valley in the other three species; (3) there were significant differences in the acoustic intensities of crow, cuckoo, magpie, and bunting calls between the upper and lower valley; and (4) species number and richness were significantly higher in the upper valley than in the lower valley. We suggest that the structure of valley habitats may lead to the breakdown of acoustic signals and communication in birds to varying degrees.
The effect of valley topography on acoustic communication could be one reason for animal species avoiding deep valleys.}, } @article {pmid36351244, year = {2022}, author = {Kim, Y and Thompson, A}, title = {An Acoustic-Phonetic Approach to Effects of Face Masks on Speech Intelligibility.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {12}, pages = {4679-4689}, pmid = {36351244}, issn = {1558-9102}, support = {F31 DC020121/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Female ; Humans ; *Speech Intelligibility ; Phonetics ; Speech Acoustics ; Acoustics ; *Speech Perception ; }, abstract = {PURPOSE: This study aimed to examine the effects of wearing a face mask on speech acoustics and intelligibility, using an acoustic-phonetic analysis of speech. In addition, the effects of speakers' behavioral modification while wearing a mask were examined.

METHOD: Fourteen female adults were asked to read a set of words and sentences under three conditions: (a) conversational, mask-off; (b) conversational, mask-on; and (c) clear, mask-on. Seventy listeners rated speech intelligibility using two methods: orthographic transcription and visual analog scale (VAS). Acoustic measures for vowels included duration, first (F1) and second (F2) formant frequency, and intensity ratio of F1/F2. For consonants, spectral moment coefficients and consonant-vowel (CV) boundary (intensity ratio between consonant and vowel) were measured.
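Spectral moment coefficients, as measured for the consonants in this study, summarize a spectrum by its first four moments. One common way to compute them from a windowed segment is sketched below; the noise segment stands in for a fricative, and the window and normalization choices are assumptions rather than the authors' exact settings.

import numpy as np

def spectral_moments(segment, sr):
    """First four spectral moments of a consonant segment:
    center of gravity, standard deviation, skewness, kurtosis (excess)."""
    spec = np.abs(np.fft.rfft(segment * np.hanning(len(segment))))
    freqs = np.fft.rfftfreq(len(segment), d=1.0 / sr)
    p = spec ** 2
    p = p / p.sum()                    # normalize power to a distribution
    m1 = np.sum(freqs * p)             # center of gravity (Hz)
    sd = np.sqrt(np.sum((freqs - m1) ** 2 * p))
    skew = np.sum((freqs - m1) ** 3 * p) / sd ** 3
    kurt = np.sum((freqs - m1) ** 4 * p) / sd ** 4 - 3.0
    return m1, sd, skew, kurt

# Hypothetical 25-ms fricative-like segment at 22.05 kHz.
sr = 22050
segment = np.random.default_rng(0).normal(size=int(0.025 * sr))
print(spectral_moments(segment, sr))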

RESULTS: Face masks had a negative impact on speech intelligibility as measured by both intelligibility methods. However, speech intelligibility was recovered in the clear speech condition for VAS but not for transcription scores. Analysis of orthographic transcription showed that listeners tended to frequently confuse consonants (particularly fricatives, affricates, and stops), rather than vowels, in the word-initial position. Acoustic data indicated a significant effect of condition on the CV intensity ratio only.

CONCLUSIONS: Our data demonstrate a negative effect of face masks on speech intelligibility, mainly affecting consonants. However, intelligibility can be enhanced by speaking clearly, likely driven by prosodic alterations.}, } @article {pmid36322641, year = {2024}, author = {Baker, CP and Sundberg, J and Purdy, SC and Rakena, TO}, title = {Female adolescent singing voice characteristics: an exploratory study using LTAS and inverse filtering.}, journal = {Logopedics, phoniatrics, vocology}, volume = {49}, number = {2}, pages = {83-95}, doi = {10.1080/14015439.2022.2140455}, pmid = {36322641}, issn = {1651-2022}, mesh = {Humans ; Adolescent ; Female ; *Singing ; *Voice Quality ; Young Adult ; *Acoustics ; *Signal Processing, Computer-Assisted ; Sound Spectrography ; Age Factors ; Time Factors ; Phonation ; Adolescent Development ; Adolescent Behavior ; Sex Factors ; }, abstract = {Background and Aim: To date, little research is available that objectively quantifies female adolescent singing-voice characteristics in light of the physiological and functional developments that occur from puberty to adulthood. This exploratory study sought to augment the pool of data available that offers objective voice analysis of female singers in late adolescence. Methods: Using long-term average spectra (LTAS) and inverse filtering techniques, dynamic range and voice-source characteristics were determined in a cohort of vocally healthy cis-gender female adolescent singers (17 to 19 years) from high-school choirs in Aotearoa New Zealand. Non-parametric statistics were used to determine associations and significant differences. Results: Wide intersubject variation was seen across dynamic range, spectral measures of harmonic organisation (formant cluster prominence, FCP), noise components in the spectrum (high-frequency energy ratio, HFER), and the normalised amplitude quotient (NAQ), suggesting great variability in the ability to control phonatory mechanisms such as subglottal pressure (Psub), glottal configuration and adduction, and vocal tract shaping. A strong association between the HFER and NAQ suggests that these non-invasive measures may offer complementary insights into vocal function, specifically with regard to glottal adduction and turbulent noise in the voice signal. Conclusion: Knowledge of the range of variation within healthy adolescent singers is necessary for the development of effective and inclusive pedagogical practices, and for vocal-health professionals working with singers of this age. LTAS and inverse filtering are useful non-invasive tools for determining such characteristics.}, } @article {pmid36313043, year = {2022}, author = {Easwar, V and Purcell, D and Eeckhoutte, MV and Aiken, SJ}, title = {The Influence of Male- and Female-Spoken Vowel Acoustics on Envelope-Following Responses.}, journal = {Seminars in hearing}, volume = {43}, number = {3}, pages = {223-239}, pmid = {36313043}, issn = {0734-0451}, abstract = {The influence of male and female vowel characteristics on the envelope-following responses (EFRs) is not well understood. This study explored the role of vowel characteristics on the EFR at the fundamental frequency (f0) in response to the vowel /ε/ (as in "head"). Vowel tokens were spoken by five males and five females and EFRs were measured in 25 young adults (21 females). An auditory model was used to estimate changes in auditory processing that might account for talker effects on EFR amplitude.
There were several differences between male and female vowels in relation to the EFR. For male talkers, EFR amplitudes were correlated with the bandwidth and harmonic count of the first formant, and the amplitude of the trough below the second formant. For female talkers, EFR amplitudes were correlated with the range of f0 frequencies and the amplitude of the trough above the second formant. The model suggested that the f0 EFR reflects a wide distribution of energy in speech, with primary contributions from high-frequency harmonics mediated from cochlear regions basal to the peaks of the first and second formants, not from low-frequency harmonics with energy near f0. Vowels produced by female talkers tend to produce lower-amplitude EFR, likely because they depend on higher-frequency harmonics where speech sound levels tend to be lower. This work advances auditory electrophysiology by showing how the EFR evoked by speech relates to the acoustics of speech, for both male and female voices.}, } @article {pmid36304844, year = {2022}, author = {Pah, ND and Indrawati, V and Kumar, DK}, title = {Voice Features of Sustained Phoneme as COVID-19 Biomarker.}, journal = {IEEE journal of translational engineering in health and medicine}, volume = {10}, number = {}, pages = {4901309}, pmid = {36304844}, issn = {2168-2372}, mesh = {Humans ; *COVID-19 ; Cross-Sectional Studies ; Longitudinal Studies ; Pandemics ; SARS-CoV-2 ; Biomarkers ; }, abstract = {BACKGROUND: The COVID-19 pandemic has resulted in enormous costs to our society. Besides finding medicines to treat those infected by the virus, it is important to find effective and efficient strategies to prevent the spreading of the disease. One key factor to prevent transmission is to identify COVID-19 biomarkers that can be used to develop an efficient, accurate, noninvasive, and self-administered screening procedure. Several COVID-19 variants cause significant respiratory symptoms, and thus a voice signal may be a potential biomarker for COVID-19 infection.

AIM: This study investigated the effectiveness of different phonemes and a range of voice features in differentiating people infected with COVID-19 who had respiratory tract symptoms from healthy controls.

METHOD: This cross-sectional, longitudinal study recorded six sustained phonemes (i.e., /a/, /e/, /i/, /o/, /u/, and /m/) from 40 COVID-19 patients and 48 healthy subjects over 22 days. Signal features were extracted from the recordings, statistically analyzed, and classified using a support vector machine (SVM).

RESULTS: The statistical analysis and SVM classification show that the voice features related to vocal tract filtering (e.g., MFCC, VTL, and formants) and to the stability of the respiratory muscles and lung volume (Intensity-SD) were the most sensitive to voice change due to COVID-19. The results also show that the features extracted from the vowel /i/ during the first 3 days after admission to the hospital were the most effective. The SVM classification accuracy with 18 ranked features extracted from /i/ was 93.5% (with an F1 score of 94.3%).
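As an illustration of the classification step described in these results (a sketch, not the authors' pipeline; the feature matrix and group separation below are synthetic), an SVM over per-recording acoustic features can be set up as follows:

```python
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

rng = np.random.default_rng(0)

# Hypothetical feature matrix: one row per recording of /i/, one column per
# ranked feature (e.g., MFCC means, formant frequencies, intensity SD).
n_patients, n_controls, n_features = 40, 48, 18
X = np.vstack([
    rng.normal(0.0, 1.0, (n_controls, n_features)),  # healthy subjects
    rng.normal(0.6, 1.0, (n_patients, n_features)),  # patients, shifted for the demo
])
y = np.array([0] * n_controls + [1] * n_patients)    # 0 = healthy, 1 = COVID-19

# Standardize features, fit an RBF-kernel SVM, and report cross-validated accuracy.
clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=1.0))
scores = cross_val_score(clf, X, y, cv=5)
print(f"5-fold CV accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")
```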

CONCLUSION: A measurable difference exists between the voices of people with COVID-19 and healthy people, and the phoneme /i/ shows the most pronounced difference. This supports the potential of computerized voice analysis to detect the disease and the use of voice as a COVID-19 biomarker.}, } @article {pmid36293884, year = {2022}, author = {Choi, MK and Yoo, SD and Park, EJ}, title = {Destruction of Vowel Space Area in Patients with Dysphagia after Stroke.}, journal = {International journal of environmental research and public health}, volume = {19}, number = {20}, pages = {}, pmid = {36293884}, issn = {1660-4601}, mesh = {Humans ; Dysarthria/complications ; *Deglutition Disorders/etiology ; Speech Acoustics ; Deglutition ; *Stroke/complications ; }, abstract = {Dysphagia is associated with dysarthria in stroke patients. Vowel space decreases in stroke patients with dysarthria; destruction of the vowel space is often observed. We determined the correlation of destruction of acoustic vowel space with dysphagia in stroke patients. Seventy-four individuals with dysphagia and dysarthria who had experienced stroke were enrolled. For the vowels /a/, /ae/, /i/, and /u/, we determined the formant parameters (which represent vocal tract resonance frequencies as two-dimensional coordinate points), the formant centralization ratio (FCR), and the quadrilateral vowel space area (VSA). Swallowing function was assessed using the videofluoroscopic dysphagia scale (VDS) during videofluoroscopic swallowing studies. Pearson's correlation and linear regression were used to determine the correlation between VSA, FCR, and VDS. Subgroups were created based on VSA; vowel space destruction groups were compared using ANOVA and Scheffe's test. VSA and FCR were negatively and positively correlated with VDS, respectively. Groups were separated based on the mean and standard deviation of VSA. One-way ANOVA revealed significant differences in VDS, FCR, and age between the VSA groups, and no significant differences in VDS between the mild and moderate VSA reduction groups and the vowel space destruction group. VSA and FCR values correlated with swallowing function. Vowel space destruction has characteristics similar to moderate-to-severe VSA reduction and has utility as an indicator of dysphagia severity.}, } @article {pmid36289365, year = {2022}, author = {Müller, M and Wang, Z and Caffier, F and Caffier, PP}, title = {New objective timbre parameters for classification of voice type and fach in professional opera singers.}, journal = {Scientific reports}, volume = {12}, number = {1}, pages = {17921}, pmid = {36289365}, issn = {2045-2322}, mesh = {Humans ; *Singing ; Voice Quality ; *Voice ; Occupations ; Sound ; }, abstract = {Voice timbre is defined as sound color independent of pitch and volume, based on a broad frequency band between 2 and 4 kHz. Since there are no specific timbre parameters, previous studies have come to the very general conclusion that the center frequencies of the singer's formants are somewhat higher in the higher voice types than in the lower ones. For specification, a database was created containing 1723 sound examples of various voice types. The energy distribution in the frequency bands of the singer's formants was extracted for quantitative analysis. When the energy distribution function reached 50%, the corresponding absolute frequency in Hz was defined as Frequency of Half Energy (FHE).
This new parameter quantifies the timbre of a singing voice as a concrete measure, independent of fundamental frequency, vowel color and volume. The database allows assigning FHE means ± SD as characteristic or comparative values for sopranos (3092 ± 284 Hz), tenors (2705 ± 221 Hz), baritones (2454 ± 206 Hz) and basses (2384 ± 164 Hz). In addition to vibrato, specific timbre parameters provide another valuable feature in vocal pedagogy for classification of voice type and fach according to the lyric or dramatic character of the voice.}, } @article {pmid36279585, year = {2022}, author = {Hussain, RO and Kumar, P and Singh, NK}, title = {Subcortical and Cortical Electrophysiological Measures in Children With Speech-in-Noise Deficits Associated With Auditory Processing Disorders.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {11}, pages = {4454-4468}, doi = {10.1044/2022_JSLHR-22-00094}, pmid = {36279585}, issn = {1558-9102}, mesh = {Child ; Humans ; Adolescent ; *Auditory Perceptual Disorders/diagnosis ; Speech ; Noise ; *Speech Perception/physiology ; Evoked Potentials, Auditory ; Acoustic Stimulation ; Evoked Potentials, Auditory, Brain Stem/physiology ; }, abstract = {PURPOSE: The aim of this study was to analyze the subcortical and cortical auditory evoked potentials for speech stimuli in children with speech-in-noise (SIN) deficits associated with auditory processing disorder (APD) without any reading or language deficits.

METHOD: The study included 20 children in the age range of 9-13 years. Ten children were recruited to the APD group; they had below-normal scores on the speech-perception-in-noise test and were diagnosed as having APD. The remaining 10 were typically developing (TD) children and were recruited to the TD group. Speech-evoked subcortical (brainstem) and cortical (auditory late latency) responses were recorded and compared across both groups.

RESULTS: The results showed a statistically significant reduction in the amplitudes of the subcortical potentials (both for stimulus in quiet and in noise) and the magnitudes of the spectral components (fundamental frequency and the second formant) in children with SIN deficits in the APD group compared to the TD group. In addition, the APD group displayed enhanced amplitudes of the cortical potentials compared to the TD group.

CONCLUSION: Children with SIN deficits associated with APD exhibited impaired coding/processing of the auditory information at the level of the brainstem and the auditory cortex.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.21357735.}, } @article {pmid36279201, year = {2022}, author = {Bochner, J and Samar, V and Prud'hommeaux, E and Huenerfauth, M}, title = {Phoneme Categorization in Prelingually Deaf Adult Cochlear Implant Users.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {11}, pages = {4429-4453}, doi = {10.1044/2022_JSLHR-22-00038}, pmid = {36279201}, issn = {1558-9102}, mesh = {Adult ; Humans ; Young Adult ; *Cochlear Implants ; *Deafness/rehabilitation ; *Speech Perception ; *Cochlear Implantation ; Hearing ; }, abstract = {PURPOSE: Phoneme categorization (PC) for voice onset time and second formant transition was studied in adult cochlear implant (CI) users with early-onset deafness and hearing controls.

METHOD: Identification and discrimination tasks were administered to 30 participants implanted before 4 years of age, 21 participants implanted after 7 years of age, and 21 hearing individuals.

RESULTS: Distinctive identification and discrimination functions confirmed PC within all groups. Compared to hearing participants, the CI groups generally displayed longer/higher category boundaries, shallower identification function slopes, reduced identification consistency, and reduced discrimination performance. A principal component analysis revealed that identification consistency, discrimination accuracy, and identification function slope, but not boundary location, loaded on a single factor, reflecting general PC performance. Earlier implantation was associated with better PC performance within the early CI group, but not the late CI group. Within the early CI group, earlier implantation age but not PC performance was associated with better speech recognition. Conversely, within the late CI group, better PC performance but not earlier implantation age was associated with better speech recognition.

CONCLUSIONS: Results suggest that implantation timing within the sensitive period before 4 years of age partly determines the level of PC performance. They also suggest that early implantation may promote development of higher level processes that can compensate for relatively poor PC performance, as can occur in challenging listening conditions.}, } @article {pmid36266347, year = {2022}, author = {Skrabal, D and Rusz, J and Novotny, M and Sonka, K and Ruzicka, E and Dusek, P and Tykalova, T}, title = {Articulatory undershoot of vowels in isolated REM sleep behavior disorder and early Parkinson's disease.}, journal = {NPJ Parkinson's disease}, volume = {8}, number = {1}, pages = {137}, pmid = {36266347}, issn = {2373-8057}, support = {NU20-08-00445//Ministerstvo Zdravotnictví Ceské Republiky (Ministry of Health of the Czech Republic)/ ; MH CZ-DRO-VFN64165//Ministerstvo Zdravotnictví Ceské Republiky (Ministry of Health of the Czech Republic)/ ; NU20-08-00445//Ministerstvo Zdravotnictví Ceské Republiky (Ministry of Health of the Czech Republic)/ ; NU20-08-00445//Ministerstvo Zdravotnictví Ceské Republiky (Ministry of Health of the Czech Republic)/ ; NU20-08-00445//Ministerstvo Zdravotnictví Ceské Republiky (Ministry of Health of the Czech Republic)/ ; }, abstract = {Imprecise vowels represent a common deficit associated with hypokinetic dysarthria resulting from a reduced articulatory range of motion in Parkinson's disease (PD). It is not yet known whether the vowel articulation impairment is already evident in the prodromal stages of synucleinopathy. We aimed to assess whether vowel articulation abnormalities are present in isolated rapid eye movement sleep behaviour disorder (iRBD) and early-stage PD. A total of 180 male participants, including 60 with iRBD, 60 with de-novo PD, and 60 age-matched healthy controls, performed reading of a standardized passage. The first and second formant frequencies of the corner vowels /a/, /i/, and /u/, extracted from predefined words, were utilized to construct the articulatory-acoustic measures of Vowel Space Area (VSA) and Vowel Articulation Index (VAI). Compared to controls, VSA was smaller in both iRBD (p = 0.01) and PD (p = 0.001), while VAI was lower only in PD (p = 0.002). The iRBD subgroup with abnormal olfactory function had smaller VSA compared to the iRBD subgroup with preserved olfactory function (p = 0.02). In PD patients, the extent of bradykinesia and rigidity correlated with VSA (r = -0.33, p = 0.01), while no correlation between axial gait symptoms or tremor and vowel articulation was detected. Vowel articulation impairment represents an early prodromal symptom in the disease process of synucleinopathy. Acoustic assessment of vowel articulation may provide a surrogate marker of synucleinopathy in scenarios where a single robust feature to monitor dysarthria progression is needed.}, } @article {pmid36266224, year = {2022}, author = {Zhang, T and He, M and Li, B and Zhang, C and Hu, J}, title = {Acoustic Characteristics of Cantonese Speech Through Protective Facial Coverings.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.08.029}, pmid = {36266224}, issn = {1873-4588}, abstract = {OBJECTIVES: Protective facial coverings (PFCs) such as surgical masks attenuate speech transmission and affect speech intelligibility, as has been reported in languages such as English and German.
The present study aimed to verify these detrimental impacts on the production of tonal languages such as Cantonese by examining the realization of acoustic correlates of Cantonese speech under PFCs, including face masks and face shields.

METHODS: We recorded scripted speech in Hong Kong Cantonese produced by three adult speakers who wore various PFCs, including surgical masks, KF94 masks, and face shields (with and without surgical masks). Spectral and temporal parameters were measured, including mean intensity, speaking rate, long-term amplitude spectrum, formant frequencies of vowels, and duration and fundamental frequency (F0) of tone-bearing parts.

RESULTS: Significant changes were observed in all acoustic correlates of Cantonese speech under PFCs. Sound pressure levels were attenuated more strongly at higher frequencies in speech through face masks, whereas sound transmission was affected more at lower frequencies in speech under face shields. Vowel spaces derived from formant frequencies shrank under all PFCs, with the vowel /aa/ demonstrating the largest changes in the first two formants. All tone-bearing parts were shortened and showed increases in mean F0 in speech through PFCs. The decrease in tone duration was statistically significant for the High-level and Low-level tones, while the increase in mean F0 was significant for the High-level tone only.
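Several entries in this section rely on a vowel space area of this kind. It is conventionally computed as the area of the polygon spanned by the corner vowels in (F2, F1) space; a minimal sketch with invented formant values:

```python
def vowel_space_area(points):
    """Polygon (shoelace) area for corner vowels given as (F2, F1) pairs in Hz,
    listed in polygon order."""
    area = 0.0
    n = len(points)
    for i in range(n):
        x1, y1 = points[i]
        x2, y2 = points[(i + 1) % n]
        area += x1 * y2 - x2 * y1
    return abs(area) / 2.0

# Hypothetical mean formants (Hz) for three corner vowels, with and without a
# mask; the shrinkage below is invented for illustration.
unmasked = [(2200.0, 300.0), (1300.0, 900.0), (800.0, 350.0)]
masked = [(2050.0, 330.0), (1280.0, 820.0), (850.0, 360.0)]
print(vowel_space_area(unmasked), vowel_space_area(masked))  # areas in Hz^2
```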

CONCLUSIONS: A general filtering effect of PFCs is observed in Cantonese speech data, confirming language-universal patterns in acoustic attenuation by PFCs. The various coverings lower the overall intensity levels of speech and degrade the speech signal in higher frequency regions. Modification patterns specific to Hong Kong Cantonese are also identified. Vowel space area was reduced and was found to be associated with increased speaking rates. Tones were produced with higher F0s under PFCs, which may be attributed to vocal tension caused by a tightened vocal tract during speaking through facial coverings.}, } @article {pmid36215575, year = {2022}, author = {Urzúa, AR and Wolf, KB}, title = {Unitary rotation of pixellated polychromatic images.}, journal = {Journal of the Optical Society of America. A, Optics, image science, and vision}, volume = {39}, number = {8}, pages = {1323-1329}, doi = {10.1364/JOSAA.462530}, pmid = {36215575}, issn = {1520-8532}, abstract = {Unitary rotations of polychromatic images on finite two-dimensional pixellated screens provide invertibility, group composition, and thus conservation of information. Rotations have been applied on monochromatic image data sets, where we now examine more closely the Gibbs-like oscillations that appear due to discrete "discontinuities" of the input images under unitary transformations. Extended to three-color images, we examine here the display of color at the pixels where, due to oscillations, some pixel color values may fall outside their required common numerical range [0,1], between absence and saturation of the red, green, and blue formant colors we choose to represent the images.}, } @article {pmid36182345, year = {2022}, author = {Rothenberg, M and Rothenberg, S}, title = {Measuring the distortion of speech by a facemask.}, journal = {JASA express letters}, volume = {2}, number = {9}, pages = {095203}, doi = {10.1121/10.0014002}, pmid = {36182345}, issn = {2691-1191}, mesh = {Acoustics ; Masks ; Mouth ; *Speech ; *Voice ; }, abstract = {Most prior research focuses on the reduced amplitude of speech caused by facemasks. This paper argues that the interaction between the acoustic properties of a facemask and the acoustic properties of the vocal tract contributes to speech distortion by changing the formants of the voice. The speech distortion caused by a number of masks was tested by measuring the increase in damping of the first formant. Results suggest that masks dampen the first formant and that increasing the distance between the mask wall and the mouth can reduce this distortion. These findings contribute to the research studying the impact of masks on speech.}, } @article {pmid36182341, year = {2022}, author = {Tran Ngoc, A and Meunier, F and Meyer, J}, title = {Testing perceptual flexibility in speech through the categorization of whistled Spanish consonants by French speakers.}, journal = {JASA express letters}, volume = {2}, number = {9}, pages = {095201}, doi = {10.1121/10.0013900}, pmid = {36182341}, issn = {2691-1191}, mesh = {Cues ; Humans ; Language ; Phonetics ; *Speech/physiology ; *Speech Perception/physiology ; }, abstract = {Whistled speech is a form of modified speech where, in non-tonal languages, vowels and consonants are augmented and transposed to whistled frequencies, simplifying their timbre. According to previous studies, these transformations maintain some level of vowel recognition for naive listeners.
Here, in a behavioral experiment, naive listeners' capacities for the categorization of four whistled consonants (/p/, /k/, /t/, and /s/) were analyzed. Results show patterns of correct responses and confusions that provide new insights into whistled speech perception, highlighting the importance of frequency modulation cues, transposed from phoneme formants, as well as the perceptual flexibility in processing these cues.}, } @article {pmid36182291, year = {2022}, author = {Winn, MB and Wright, RA}, title = {Reconsidering commonly used stimuli in speech perception experiments.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {3}, pages = {1394}, doi = {10.1121/10.0013415}, pmid = {36182291}, issn = {1520-8524}, mesh = {Language ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception/physiology ; *Voice ; }, abstract = {This paper examines some commonly used stimuli in speech perception experiments and raises questions about their use, or about the interpretations of previous results. The takeaway messages are: 1) the Hillenbrand vowels represent a particular dialect rather than a gold standard, and English vowels contain spectral dynamics that have been largely underappreciated, 2) the /ɑ/ context is very common but not clearly superior as a context for testing consonant perception, 3) /ɑ/ is particularly problematic when testing voice-onset-time perception because it introduces strong confounds in the formant transitions, 4) /dɑ/ is grossly overrepresented in neurophysiological studies and yet is insufficient as a generalized proxy for "speech perception," and 5) digit tests and matrix sentences including the coordinate response measure are systematically insensitive to important patterns in speech perception. Each of these stimulus sets and concepts is described with careful attention to their unique value and also cases where they might be misunderstood or over-interpreted.}, } @article {pmid36171463, year = {2022}, author = {Borodkin, K and Gassner, T and Ershaid, H and Amir, N}, title = {tDCS modulates speech perception and production in second language learners.}, journal = {Scientific reports}, volume = {12}, number = {1}, pages = {16212}, pmid = {36171463}, issn = {2045-2322}, mesh = {Acoustic Stimulation ; Adult ; Humans ; Language ; Phonetics ; Speech/physiology ; *Speech Perception/physiology ; *Transcranial Direct Current Stimulation ; }, abstract = {Accurate identification and pronunciation of nonnative speech sounds can be particularly challenging for adult language learners. The current study tested the effects of a brief musical training combined with transcranial direct current stimulation (tDCS) on speech perception and production in a second language (L2). The sample comprised 36 native Hebrew speakers, aged 18-38, who studied English as L2 in a formal setting and had little musical training. Training encompassed musical perception tasks with feedback (i.e., timbre, duration, and tonal memory) and concurrent tDCS applied over the left posterior auditory-related cortex (including posterior superior temporal gyrus and planum temporale). Participants were randomly assigned to anodal or sham stimulation. Musical perception, L2 speech perception (measured by a categorical AXB discrimination task) and speech production (measured by a speech imitation task) were tested before and after training. There were no tDCS-dependent effects on musical perception post-training. 
However, only participants who received active stimulation showed increased accuracy of L2 phoneme discrimination and greater change in the acoustic properties of L2 speech sound production (i.e., second formant frequency in vowels and center of gravity in consonants). The results of this study suggest neuromodulation can facilitate the processing of nonnative speech sounds in adult learners.}, } @article {pmid36154230, year = {2022}, author = {Morse, RP and Holmes, SD and Irving, R and McAlpine, D}, title = {Noise helps cochlear implant listeners to categorize vowels.}, journal = {JASA express letters}, volume = {2}, number = {4}, pages = {042001}, doi = {10.1121/10.0010071}, pmid = {36154230}, issn = {2691-1191}, mesh = {*Cochlear Implantation ; *Cochlear Implants ; Noise/adverse effects ; Phonetics ; *Speech Perception ; }, abstract = {Theoretical studies demonstrate that controlled addition of noise can enhance the amount of information transmitted by a cochlear implant (CI). The present study is a proof-of-principle for whether stochastic facilitation can improve the ability of CI users to categorize speech sounds. Analogue vowels were presented to CI users through a single electrode with independent noise on multiple electrodes. Noise improved vowel categorization, particularly in terms of an increase in information conveyed by the first and second formant. Noise, however, did not significantly improve vowel recognition: the miscategorizations were just more consistent, giving the potential to improve with experience.}, } @article {pmid36129844, year = {2022}, author = {Easwar, V and Purcell, D and Lasarev, M and McGrath, E and Galloy, M}, title = {Speech-Evoked Envelope Following Responses in Children and Adults.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {10}, pages = {4009-4023}, doi = {10.1044/2022_JSLHR-22-00156}, pmid = {36129844}, issn = {1558-9102}, mesh = {Acoustic Stimulation ; Adolescent ; Child ; Hearing Tests ; Humans ; Male ; Sensitivity and Specificity ; *Speech ; *Speech Perception/physiology ; Young Adult ; }, abstract = {PURPOSE: Envelope following responses (EFRs) could be useful for objectively evaluating audibility of speech in children who are unable to participate in routine clinical tests. However, relative to adults, the characteristics of EFRs elicited by frequency-specific speech and their utility in predicting audibility in children are unknown.

METHOD: EFRs were elicited by the first (F1) and second and higher formants (F2+) of male-spoken vowels /u/ and /i/ and by fricatives /ʃ/ and /s/ in the token /suʃi/ presented at 15, 35, 55, 65, and 75 dB SPL. The F1, F2+, and fricatives were low-, mid-, and high-frequency dominant, respectively. EFRs were recorded between the vertex and the nape from twenty-three 6- to 17-year-old children and 21 young adults with normal hearing. Sensation levels of stimuli were estimated based on behavioral thresholds.

RESULTS: In children, amplitude decreased with age for /ʃ/-elicited EFRs but remained stable for low- and mid-frequency stimuli. As a group, EFR amplitude and phase coherence did not differ from those of adults. EFR sensitivity (proportion of audible stimuli detected) and specificity (proportion of inaudible stimuli not detected) did not vary between children and adults. Consistent with previous work, EFR sensitivity increased with stimulus frequency and level. The type of statistical indicator used for EFR detection did not influence accuracy in children.
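The statistical indicators referred to here test whether response power at the stimulus envelope frequency exceeds the surrounding noise floor. A crude sketch of one such indicator, a bin-power-to-neighboring-bins ratio over a simulated response (not necessarily the exact indicators compared in the study):

```python
import numpy as np

def efr_power_ratio(response, fs, f0, n_neighbors=24):
    """Power in the FFT bin nearest the response frequency divided by the mean
    power of neighboring bins, which serve as a noise-floor estimate."""
    power = np.abs(np.fft.rfft(response)) ** 2
    freqs = np.fft.rfftfreq(len(response), d=1.0 / fs)
    k = int(np.argmin(np.abs(freqs - f0)))    # bin nearest the response frequency
    lo = max(k - n_neighbors // 2, 1)         # skip the DC bin
    hi = min(k + n_neighbors // 2 + 1, len(power))
    noise_bins = [i for i in range(lo, hi) if i != k]
    return power[k] / power[noise_bins].mean()

# Simulated averaged response: 1 s of noise plus a weak 100 Hz component.
fs, f0 = 5000, 100.0
t = np.arange(fs) / fs
rng = np.random.default_rng(1)
response = 0.02 * np.sin(2 * np.pi * f0 * t) + rng.normal(0.0, 0.1, fs)
print(f"Power ratio at {f0} Hz: {efr_power_ratio(response, fs, f0):.1f}")
```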

CONCLUSIONS: Adultlike EFRs in 6- to 17-year-old typically developing children suggest mature envelope encoding for low- and mid-frequency stimuli. EFR sensitivity and specificity in children, when considering a wide range of stimulus levels and audibility, are ~77% and ~92%, respectively.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.21136171.}, } @article {pmid36092651, year = {2022}, author = {Nault, DR and Mitsuya, T and Purcell, DW and Munhall, KG}, title = {Perturbing the consistency of auditory feedback in speech.}, journal = {Frontiers in human neuroscience}, volume = {16}, number = {}, pages = {905365}, pmid = {36092651}, issn = {1662-5161}, abstract = {Sensory information, including auditory feedback, is used by talkers to maintain fluent speech articulation. Current models of speech motor control posit that speakers continually adjust their motor commands based on discrepancies between the sensory predictions made by a forward model and the sensory consequences of their speech movements. Here, in two within-subject design experiments, we used a real-time formant manipulation system to explore how reliant speech articulation is on the accuracy or predictability of auditory feedback information. This involved introducing random formant perturbations during vowel production that varied systematically in their spatial location in formant space (Experiment 1) and temporal consistency (Experiment 2). Our results indicate that, on average, speakers' responses to auditory feedback manipulations varied based on the relevance and degree of the error that was introduced in the various feedback conditions. In Experiment 1, speakers' average production was not reliably influenced by random perturbations that were introduced every utterance to the first (F1) and second (F2) formants in various locations of formant space that had an overall average of 0 Hz. However, when perturbations were applied that had a mean of +100 Hz in F1 and -125 Hz in F2, speakers demonstrated reliable compensatory responses that reflected the average magnitude of the applied perturbations. In Experiment 2, speakers did not significantly compensate for perturbations of varying magnitudes that were held constant for one and three trials at a time. Speakers' average productions did, however, significantly deviate from a control condition when perturbations were held constant for six trials. Within the context of these conditions, our findings provide evidence that the control of speech movements is, at least in part, dependent upon the reliability and stability of the sensory information that it receives over time.}, } @article {pmid36063640, year = {2022}, author = {Frankford, SA and Cai, S and Nieto-Castañón, A and Guenther, FH}, title = {Auditory feedback control in adults who stutter during metronome-paced speech II. Formant Perturbation.}, journal = {Journal of fluency disorders}, volume = {74}, number = {}, pages = {105928}, pmid = {36063640}, issn = {1873-801X}, support = {R01 DC007683/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Humans ; *Stuttering/therapy ; Speech/physiology ; Feedback ; Feedback, Sensory/physiology ; Auditory Perception/physiology ; }, abstract = {PURPOSE: Prior work has shown that Adults who stutter (AWS) have reduced and delayed responses to auditory feedback perturbations. This study aimed to determine whether external timing cues, which increase fluency, resolve auditory feedback processing disruptions.

METHODS: Fifteen AWS and sixteen adults who do not stutter (ANS) read aloud a multisyllabic sentence either with natural stress and timing or with each syllable paced at the rate of a metronome. On random trials, an auditory feedback formant perturbation was applied, and formant responses were compared between groups and pacing conditions.

RESULTS: During normally paced speech, ANS showed a significant compensatory response to the perturbation by the end of the perturbed vowel, while AWS did not. In the metronome-paced condition, which significantly reduced the disfluency rate, the opposite was true: AWS showed a significant response by the end of the vowel, while ANS did not.

CONCLUSION: These findings indicate a potential link between the reduction in stuttering found during metronome-paced speech and changes in auditory motor integration in AWS.}, } @article {pmid36050247, year = {2022}, author = {Lee, SH and Lee, GS}, title = {Long-term Average Spectrum and Nasal Accelerometry in Sentences of Differing Nasality and Forward-Focused Vowel Productions Under Altered Auditory Feedback.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.07.026}, pmid = {36050247}, issn = {1873-4588}, abstract = {OBJECTIVES AND BACKGROUND: To investigate whether voice focus adjustments can alter the audio-vocal feedback and consequently modulate speech/voice motor control. Speaking with a forward-focused voice was expected to enhance audio-vocal feedback and thus decrease the variability of vocal fundamental frequency (F0).

MATERIALS AND METHOD: Twenty-two healthy, untrained adults (10 males and 12 females) were requested to sustain the vowel /a/ with their natural focus and a forward focus and to naturally read the nasal, oral, and mixed oral-nasal sentences in normal and noise-masked auditory conditions. Meanwhile, a miniature accelerometer was externally attached to the nose to detect the nasal vibrations during vocalization. Audio recordings were made and analyzed using the long-term average spectrum (LTAS) and power spectral analysis of F0.

RESULTS: Compared with naturally-focused vowel production and oral sentences, forward-focused vowel productions and nasal sentences both showed significant increases in nasal accelerometric amplitude and in the spectral power within the range of 200∼300 Hz, and significant decreases in the F0 variability below 3 Hz, which has been reported to be associated with enhanced auditory feedback in our previous research. The auditory masking not only significantly increased the low-frequency F0 variability, but also significantly decreased the ratio of the spectral power within 200∼300 Hz to the power within 300∼1000 Hz for the vowel and sentence productions. Gender differences were found in the correlations between the degree of nasal coupling and F0 stability as well as in the LTAS characteristics in response to noise.
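The 200∼300 Hz to 300∼1000 Hz power ratio reported here can be estimated from a recording with a standard Welch PSD; a minimal sketch, with a synthetic vowel-like signal standing in for real recordings:

```python
import numpy as np
from scipy.signal import welch

def band_power_ratio(x, fs, band_num=(200.0, 300.0), band_den=(300.0, 1000.0)):
    """Ratio of spectral power in two frequency bands, from a Welch PSD estimate."""
    freqs, psd = welch(x, fs=fs, nperseg=4096)
    def band_power(lo, hi):
        mask = (freqs >= lo) & (freqs < hi)
        return np.trapz(psd[mask], freqs[mask])
    return band_power(*band_num) / band_power(*band_den)

# Synthetic vowel-like signal: 250 Hz fundamental with weaker harmonics, so
# some energy falls inside the 200-300 Hz band and some above it.
fs = 16000
t = np.arange(2 * fs) / fs
x = sum((1.0 / k) * np.sin(2 * np.pi * 250.0 * k * t) for k in range(1, 6))
print(f"Power ratio (200-300 Hz vs 300-1000 Hz): {band_power_ratio(x, fs):.2f}")
```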

CONCLUSIONS: Variations in nasal-oral acoustic coupling not only change the formant features of speech signals, but involuntarily influence the auditory feedback control of vocal fold vibrations. Speakers tend to show improved F0 stability in response to a forward-focused voice adjustment.}, } @article {pmid36050180, year = {2022}, author = {Ibrahim, O and Yuen, I and van Os, M and Andreeva, B and Möbius, B}, title = {The combined effects of contextual predictability and noise on the acoustic realisation of German syllables.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {2}, pages = {911}, doi = {10.1121/10.0013413}, pmid = {36050180}, issn = {1520-8524}, mesh = {Acoustics ; Language ; *Noise/adverse effects ; Speech ; Speech Acoustics ; *Speech Perception ; }, abstract = {Speakers tend to speak clearly in noisy environments, while they tend to reserve effort by shortening word duration in predictable contexts. It is unclear how these two communicative demands are met. The current study investigates the acoustic realizations of syllables in predictable vs unpredictable contexts across different background noise levels. Thirty-eight German native speakers produced 60 CV syllables in two predictability contexts in three noise conditions (reference = quiet, 0 dB and -10 dB signal-to-noise ratio). Duration, intensity (average and range), F0 (median), and vowel formants of the target syllables were analysed. The presence of noise yielded significantly longer duration, higher average intensity, larger intensity range, and higher F0. Noise levels affected intensity (average and range) and F0. Low predictability syllables exhibited longer duration and larger intensity range. However, no interaction was found between noise and predictability. This suggests that noise-related modifications might be independent of predictability-related changes, with implications for including channel-based and message-based formulations in speech production.}, } @article {pmid36050169, year = {2022}, author = {Krumbiegel, J and Ufer, C and Blank, H}, title = {Influence of voice properties on vowel perception depends on speaker context.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {2}, pages = {820}, doi = {10.1121/10.0013363}, pmid = {36050169}, issn = {1520-8524}, mesh = {Humans ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {Different speakers produce the same intended vowel with very different physical properties. Fundamental frequency (F0) and formant frequencies (FF), the two main parameters that discriminate between voices, also influence vowel perception. While it has been shown that listeners comprehend speech more accurately if they are familiar with a talker's voice, it is still unclear how such prior information is used when decoding the speech stream. In three online experiments, we examined the influence of speaker context via F0 and FF shifts on the perception of /o/-/u/ vowel contrasts. Participants perceived vowels from an /o/-/u/ continuum shifted toward /u/ when F0 was lowered or FF increased relative to the original speaker's voice and vice versa. This shift was reduced when the speakers were presented in a block-wise context compared to random order. Conversely, the original base voice was perceived to be shifted toward /u/ when presented in the context of a low F0 or high FF speaker, compared to a shift toward /o/ with high F0 or low FF speaker context. 
These findings demonstrate that F0 and FF jointly influence vowel perception in speaker context.}, } @article {pmid36050157, year = {2022}, author = {Whalen, DH and Chen, WR and Shadle, CH and Fulop, SA}, title = {Formants are easy to measure; resonances, not so much: Lessons from Klatt (1986).}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {2}, pages = {933}, pmid = {36050157}, issn = {1520-8524}, support = {R01 DC002717/DC/NIDCD NIH HHS/United States ; }, mesh = {*Acoustics ; Algorithms ; Canada ; Humans ; Language ; *Speech Acoustics ; }, abstract = {Formants in speech signals are easily identified, largely because formants are defined to be local maxima in the wideband sound spectrum. Sadly, this is not what is of most interest in analyzing speech; instead, resonances of the vocal tract are of interest, and they are much harder to measure. Klatt [(1986). in Proceedings of the Montreal Satellite Symposium on Speech Recognition, 12th International Congress on Acoustics, edited by P. Mermelstein (Canadian Acoustical Society, Montreal), pp. 5-7] showed that estimates of resonances are biased by harmonics while the human ear is not. Several analysis techniques placed the formant closer to a strong harmonic than to the center of the resonance. This "harmonic attraction" can persist with newer algorithms and in hand measurements, and systematic errors can persist even in large corpora. Research has shown that the reassigned spectrogram is less subject to these errors than linear predictive coding and similar measures, but it has not been satisfactorily automated, making its wider use unrealistic. Pending better techniques, the recommendations are (1) acknowledge limitations of current analyses regarding influence of F0 and limits on granularity, (2) report settings more fully, (3) justify settings chosen, and (4) examine the pattern of F0 vs F1 for possible harmonic bias.}, } @article {pmid36009709, year = {2022}, author = {Beeck, VC and Heilmann, G and Kerscher, M and Stoeger, AS}, title = {Sound Visualization Demonstrates Velopharyngeal Coupling and Complex Spectral Variability in Asian Elephants.}, journal = {Animals : an open access journal from MDPI}, volume = {12}, number = {16}, pages = {}, pmid = {36009709}, issn = {2076-2615}, support = {W 1262/FWF_/Austrian Science Fund FWF/Austria ; P31034-B29//Austrian Science Fund (FWF)/ ; W1262-B29//Austrian Science Fund (FWF)/ ; //Marie Jahoda-Scholarship/ ; Final fellowship//VDS CoBeNe/ ; P 31034/FWF_/Austrian Science Fund FWF/Austria ; }, abstract = {Sound production mechanisms set the parameter space available for transmitting biologically relevant information in vocal signals. Low-frequency rumbles play a crucial role in coordinating social interactions in elephants' complex fission-fusion societies. By emitting rumbles through either the oral or the three-times longer nasal vocal tract, African elephants alter their spectral shape significantly. In this study, we used an acoustic camera to visualize the sound emission of rumbles in Asian elephants, which have received far less research attention than African elephants. We recorded nine adult captive females and analyzed the spectral parameters of 203 calls, including vocal tract resonances (formants). We found that the majority of rumbles (64%) were nasally emitted, 21% orally, and 13% simultaneously through the mouth and trunk, demonstrating velopharyngeal coupling. Some of the rumbles were combined with orally emitted roars.
The nasal rumbles concentrated most spectral energy in lower frequencies and exhibited two formants, whereas the oral and mixed rumbles contained higher formants and higher concentrations of spectral energy and were louder. The roars were the loudest, highest, and broadest in frequency. This study is the first to demonstrate velopharyngeal coupling in a non-human animal. Our findings provide a foundation for future research into the adaptive functions of elephant acoustic variability for information coding, localizability, or sound transmission, as well as vocal flexibility across species.}, } @article {pmid36007484, year = {2022}, author = {Rong, P and Hansen, O and Heidrick, L}, title = {Relationship between rate-elicited changes in muscular-kinematic control strategies and acoustic performance in individuals with ALS-A multimodal investigation.}, journal = {Journal of communication disorders}, volume = {99}, number = {}, pages = {106253}, doi = {10.1016/j.jcomdis.2022.106253}, pmid = {36007484}, issn = {1873-7994}, mesh = {Acoustics ; *Amyotrophic Lateral Sclerosis ; Biomechanical Phenomena/physiology ; Humans ; Speech/physiology ; Speech Acoustics ; Speech Intelligibility/physiology ; Speech Production Measurement ; Tongue ; }, abstract = {INTRODUCTION: As a key control variable, duration has long been suspected to mediate the organization of speech motor control strategies, which has management implications for neuromotor speech disorders. This study aimed to experimentally delineate the role of duration in organizing speech motor control in neurologically healthy and impaired speakers using a voluntary speaking rate manipulation paradigm.

METHODS: Thirteen individuals with amyotrophic lateral sclerosis (ALS) and 10 healthy controls performed a sentence reading task three times, first at their habitual rate, then at a slower rate. A multimodal approach combining surface electromyography, kinematic, and acoustic technologies was used to record jaw muscle activities, jaw kinematics, and speech acoustics. Six muscular-kinematic features were extracted and factor-analyzed to characterize the organization of the mandibular control hierarchy. Five acoustic features were extracted, measuring the spectrotemporal properties of the diphthong /ɑɪ/ and the plosives /t/ and /k/.

RESULTS: The muscular-kinematic features converged into two interpretable latent factors, reflecting the level and cohesiveness/flexibility of mandibular control, respectively. Voluntary rate reduction led to a trend toward (1) finer, less cohesive, and more flexible mandibular control, and (2) increased range and decreased transition slope of the diphthong formants, across neurologically healthy and impaired groups. Differential correlations were found between the rate-elicited changes in mandibular control and acoustic performance for neurologically healthy and impaired speakers.

CONCLUSIONS: The results provided empirical evidence for the long-suspected but previously unsubstantiated role of duration in (re)organizing speech motor control strategies. The rate-elicited reorganization of muscular-kinematic control contributed to the acoustic performance of healthy speakers, in ways consistent with theoretical predictions. Such contributions were less consistent in impaired speakers, implying the complex nature of speaking rate reduction in ALS, possibly reflecting an interplay of disease-related constraints and volitional duration control. This information may help to stratify and identify candidates for the rate manipulation therapy.}, } @article {pmid36002663, year = {2022}, author = {Easwar, V and Aiken, S and Beh, K and McGrath, E and Galloy, M and Scollie, S and Purcell, D}, title = {Variability in the Estimated Amplitude of Vowel-Evoked Envelope Following Responses Caused by Assumed Neurophysiologic Processing Delays.}, journal = {Journal of the Association for Research in Otolaryngology : JARO}, volume = {23}, number = {6}, pages = {759-769}, pmid = {36002663}, issn = {1438-7573}, support = {//CIHR/Canada ; }, mesh = {Young Adult ; Child ; Male ; Humans ; Adolescent ; Acoustic Stimulation/methods ; *Speech Perception/physiology ; Noise ; Electroencephalography/methods ; Reaction Time/physiology ; }, abstract = {Vowel-evoked envelope following responses (EFRs) reflect neural encoding of the fundamental frequency of voice (f0). Accurate analysis of EFRs elicited by natural vowels requires the use of methods like the Fourier analyzer (FA) to consider the production-related f0 changes. The FA's accuracy in estimating EFRs is, however, dependent on the assumed neurophysiological processing delay needed to time-align the f0 time course and the recorded electroencephalogram (EEG). For male-spoken vowels (f0 ~ 100 Hz), a constant 10-ms delay correction is often assumed. Since processing delays vary with stimulus and physiological factors, we quantified (i) the delay-related variability that would occur in EFR estimation, and (ii) the influence of stimulus frequency, non-f0 related neural activity, and the listener's age on such variability. EFRs were elicited by the low-frequency first formant, and mid-frequency second and higher formants of /u/, /a/, and /i/ in young adults and 6- to 17-year-old children. To time-align with the f0 time course, EEG was shifted by delays between 5 and 25 ms to encompass plausible response latencies. The delay-dependent range in EFR amplitude did not vary by stimulus frequency or age and was significantly smaller when interference from low-frequency activity was reduced. On average, the delay-dependent range was < 22% of the maximum variability in EFR amplitude that could be expected by noise. Results suggest that using a constant EEG delay correction in FA analysis does not substantially alter EFR amplitude estimation. 
In the present study, the lack of substantial variability was likely facilitated by using vowels with small f0 ranges.}, } @article {pmid35993422, year = {2024}, author = {Clarke, H and Leav, S and Zestic, J and Mohamed, I and Salisbury, I and Sanderson, P}, title = {Enhanced Neonatal Pulse Oximetry Sounds for the First Minutes of Life: A Laboratory Trial.}, journal = {Human factors}, volume = {66}, number = {4}, pages = {1017-1036}, doi = {10.1177/00187208221118472}, pmid = {35993422}, issn = {1547-8181}, mesh = {Humans ; Infant, Newborn ; *Resuscitation ; *Oximetry ; Oxygen ; Sound ; Heart Rate ; }, abstract = {OBJECTIVE: Auditory enhancements to the pulse oximetry tone may help clinicians detect deviations from target ranges for oxygen saturation (SpO2) and heart rate (HR).

BACKGROUND: Clinical guidelines recommend target ranges for SpO2 and HR during neonatal resuscitation in the first 10 minutes after birth. The pulse oximeter currently maps HR to tone rate, and SpO2 to tone pitch. However, deviations from target ranges for SpO2 and HR are not easy to detect.

METHOD: Forty-one participants were presented with 30-second simulated scenarios of an infant's SpO2 and HR levels in the first minutes after birth. Tremolo marked distinct HR ranges and formants marked distinct SpO2 ranges. Participants were randomly allocated to conditions: (a) No Enhancement control, (b) Enhanced HR Only, (c) Enhanced SpO2 Only, and (d) Enhanced Both.
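To make the tremolo enhancement concrete, one simple implementation amplitude-modulates the oximeter beep at a rate that identifies the current HR range; a minimal numpy sketch with hypothetical pitch, rate, and depth values (not the study's parameters):

```python
import numpy as np

def oximeter_beep(pitch_hz, tremolo_hz, fs=44100, dur=0.15, depth=0.8):
    """One pulse-oximeter-style beep: a sine tone (whose pitch would track SpO2)
    with sinusoidal amplitude modulation (tremolo) marking an HR range."""
    t = np.arange(int(fs * dur)) / fs
    carrier = np.sin(2.0 * np.pi * pitch_hz * t)
    # Amplitude swings between (1 - depth) and 1 at the tremolo rate.
    am = 1.0 - depth * 0.5 * (1.0 + np.sin(2.0 * np.pi * tremolo_hz * t))
    return carrier * am

beep = oximeter_beep(880.0, 30.0)  # hypothetical pitch and tremolo rate
print(beep.shape)
```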

RESULTS: Participants in the Enhanced HR Only and Enhanced SpO2 Only conditions identified HR and SpO2 ranges, respectively, more accurately than participants in the No Enhancement condition, ps < 0.001. In the Enhanced Both condition, the tremolo enhancement of HR did not affect participants' ability to identify SpO2 range, but the formants enhancement of SpO2 may have attenuated participants' ability to identify tremolo-enhanced HR range.

CONCLUSION: Tremolo and formant enhancements improve range identification for HR and SpO2, respectively, and could improve clinicians' ability to identify SpO2 and HR ranges in the first minutes after birth.

APPLICATION: Enhancements to the pulse oximeter tone to indicate clinically important ranges could improve the management of oxygen delivery to the neonate during resuscitation in the first 10 minutes after birth.}, } @article {pmid35961825, year = {2022}, author = {Nascimento, GFD and Silva, HJD and Oliveira, KGSC and Lira, SZ and Gomes, AOC}, title = {Relationship Between Oropharyngeal Geometry and Acoustic Parameters in Singers: A Preliminary Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.07.012}, pmid = {35961825}, issn = {1873-4588}, abstract = {OBJECTIVE: To verify possible correlations between formant and cepstral parameters and oropharyngeal geometry in singers, stratified by sex.

METHOD: Voice records and oropharyngeal measures of 31 singers - 13 females and 18 males, mean age of 28 (±5.0) years - were retrieved from a database and analyzed. The oropharyngeal geometry measures were collected with acoustic pharyngometry, and the voice records consisted of sustained phonation of the vowel /ɛ/, which was exported to Praat software and edited to obtain the formant and cepstral parameters, stratified by sex. The Pearson linear correlation test was applied to relate voice parameters to oropharyngeal geometry, at the 5% significance level; linear regression was used to model the variable related to the second formant.

RESULTS: Differences between the sexes were identified only in the oral cavity length (greater in males) and pharyngeal cavity length (greater in females). There was a linear correlation between the third formant and the cepstrum in the female group. In the male group, there was a linear correlation between the cepstrum and the third and fourth formants. A positive linear correlation with up to 95% confidence was also identified between the pharyngeal cavity volume and the second formant in the female group, making it possible to estimate a regression model for the second formant (R2 = 0.70).
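A simple ordinary-least-squares fit illustrates the kind of one-predictor model estimated here; the volume and F2 values below are invented for illustration (the study's R2 = 0.70 comes from its own data):

```python
import numpy as np
from scipy.stats import linregress

# Hypothetical pharyngeal cavity volumes (cm^3) and second-formant values (Hz)
# for a small group of female singers; values are invented for illustration.
volume = np.array([28.0, 31.5, 25.2, 34.1, 29.8, 27.3, 33.0, 30.6])
f2_hz = np.array([1850.0, 1915.0, 1790.0, 1985.0, 1875.0, 1830.0, 1950.0, 1895.0])

fit = linregress(volume, f2_hz)
print(f"F2 = {fit.slope:.1f} * volume + {fit.intercept:.1f}"
      f" (R^2 = {fit.rvalue ** 2:.2f})")
```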

CONCLUSION: There are sex-related correlations between oropharyngeal geometry and the formant and cepstral parameters. In females, the pharyngeal cavity volume showed the strongest correlation with the second formant.}, } @article {pmid35951711, year = {2022}, author = {Nishimura, T and Tokuda, IT and Miyachi, S and Dunn, JC and Herbst, CT and Ishimura, K and Kaneko, A and Kinoshita, Y and Koda, H and Saers, JPP and Imai, H and Matsuda, T and Larsen, ON and Jürgens, U and Hirabayashi, H and Kojima, S and Fitch, WT}, title = {Evolutionary loss of complexity in human vocal anatomy as an adaptation for speech.}, journal = {Science (New York, N.Y.)}, volume = {377}, number = {6607}, pages = {760-763}, doi = {10.1126/science.abm1574}, pmid = {35951711}, issn = {1095-9203}, mesh = {Animals ; *Biological Evolution ; Humans ; *Larynx/anatomy & histology ; *Phonation ; Phonetics ; *Primates ; *Speech ; Speech Acoustics ; *Vocal Cords/anatomy & histology ; }, abstract = {Human speech production obeys the same acoustic principles as vocal production in other animals but has distinctive features: A stable vocal source is filtered by rapidly changing formant frequencies. To understand speech evolution, we examined a wide range of primates, combining observations of phonation with mathematical modeling. We found that source stability relies upon simplifications in laryngeal anatomy, specifically the loss of air sacs and vocal membranes. We conclude that the evolutionary loss of vocal membranes allows human speech to mostly avoid the spontaneous nonlinear phenomena and acoustic chaos common in other primate vocalizations. This loss allows our larynx to produce stable, harmonic-rich phonation, ideally highlighting formant changes that convey most phonetic information. Paradoxically, the increased complexity of human spoken language thus followed simplification of our laryngeal anatomy.}, } @article {pmid35944059, year = {2022}, author = {Suresh, CH and Krishnan, A}, title = {Frequency-Following Response to Steady-State Vowel in Quiet and Background Noise Among Marching Band Participants With Normal Hearing.}, journal = {American journal of audiology}, volume = {31}, number = {3}, pages = {719-736}, doi = {10.1044/2022_AJA-21-00226}, pmid = {35944059}, issn = {1558-9137}, mesh = {Acoustic Stimulation/methods ; Auditory Perception/physiology ; Hearing ; Humans ; *Noise ; Sound ; *Speech Perception/physiology ; }, abstract = {OBJECTIVE: Human studies enrolling individuals at high risk for cochlear synaptopathy (CS) have reported difficulties in speech perception in adverse listening conditions. The aim of this study is to determine if these individuals show a degradation in the neural encoding of speech in quiet and in the presence of background noise as reflected in neural phase-locking to both envelope periodicity and temporal fine structure (TFS). To our knowledge, there are no published reports that have specifically examined the neural encoding of both envelope periodicity and TFS of speech stimuli (in quiet and in adverse listening conditions) among a sample with loud-sound exposure history who are at risk for CS.

METHOD: Using the scalp-recorded frequency-following response (FFR), the authors evaluated the neural encoding of envelope periodicity (FFRENV) and TFS (FFRTFS) for a steady-state vowel (English back vowel /u/) in quiet and in the presence of speech-shaped noise presented at +5 and 0 dB SNR. Participants were young individuals with normal hearing who had either participated in a marching band for at least 5 years (high-risk group) or had no marching band experience and a low noise exposure history (low-risk group).

RESULTS: The results showed no group differences in the neural encoding of either the FFRENV or the first formant (F1) in the FFRTFS in quiet and in noise. Paradoxically, the high-risk group demonstrated enhanced representation of F2 harmonics across all stimulus conditions.

CONCLUSIONS: These results appear to be in line with a music experience-dependent enhancement of F2 harmonics. However, due to sound overexposure in the high-risk group, the role of homeostatic central compensation cannot be ruled out. A larger-scale data set with different noise exposure backgrounds, together with longitudinal measurements using an array of behavioral and electrophysiological tests, is needed to disentangle the nature of the complex interaction between the effects of central compensatory gain and experience-dependent enhancement.}, } @article {pmid35944047, year = {2022}, author = {McAllister, T and Eads, A and Kabakoff, H and Scott, M and Boyce, S and Whalen, DH and Preston, JL}, title = {Baseline Stimulability Predicts Patterns of Response to Traditional and Ultrasound Biofeedback Treatment for Residual Speech Sound Disorder.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {8}, pages = {2860-2880}, pmid = {35944047}, issn = {1558-9102}, support = {F31 DC018197/DC/NIDCD NIH HHS/United States ; R01 DC017476/DC/NIDCD NIH HHS/United States ; R01 DC013668/DC/NIDCD NIH HHS/United States ; }, mesh = {*Apraxias ; Biofeedback, Psychology/methods ; Humans ; Language ; Speech/physiology ; *Speech Sound Disorder/diagnostic imaging/therapy ; Speech Therapy/methods ; }, abstract = {PURPOSE: This study aimed to identify predictors of response to treatment for residual speech sound disorder (RSSD) affecting English rhotics. Progress was tracked during an initial phase of traditional motor-based treatment and a longer phase of treatment incorporating ultrasound biofeedback. Based on previous literature, we focused on baseline stimulability and sensory acuity as predictors of interest.

METHOD: Thirty-three individuals aged 9-15 years with residual distortions of /ɹ/ received a course of individual intervention comprising 1 week of intensive traditional treatment and 9 weeks of ultrasound biofeedback treatment. Stimulability for /ɹ/ was probed prior to treatment, after the traditional treatment phase, and after the end of all treatment. Accuracy of /ɹ/ production in each probe was assessed with an acoustic measure: normalized third formant (F3)-second formant (F2) distance. Model-based clustering analysis was applied to these acoustic measures to identify different average trajectories of progress over the course of treatment. The resulting clusters were compared with respect to acuity in auditory and somatosensory domains.
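The F3-F2 distance used as the accuracy measure here can be extracted with Praat or its Python interface, parselmouth; a sketch assuming a WAV file and a hand-marked /ɹ/ interval (the study's normalization step is omitted):

```python
import numpy as np
import parselmouth  # Python interface to Praat

def mean_f3_f2_distance(wav_path, t_start, t_end, step=0.01):
    """Mean F3 - F2 distance (Hz) over a marked /ɹ/ interval; smaller distances
    generally correspond to a more accurate American English /ɹ/."""
    snd = parselmouth.Sound(wav_path)
    formants = snd.to_formant_burg(time_step=step, maximum_formant=5500.0)
    times = np.arange(t_start, t_end, step)
    f2 = np.array([formants.get_value_at_time(2, t) for t in times])
    f3 = np.array([formants.get_value_at_time(3, t) for t in times])
    valid = ~np.isnan(f2) & ~np.isnan(f3)  # Praat returns NaN where tracking fails
    return float(np.mean(f3[valid] - f2[valid]))

# Hypothetical usage, with an invented file name and hand-marked interval:
# print(mean_f3_f2_distance("rhotic_probe.wav", 0.35, 0.55))
```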

RESULTS: All but four individuals were judged to exhibit a clinically significant response to the combined course of treatment. Two major clusters were identified. The "low stimulability" cluster was characterized by very low accuracy at baseline, minimal response to traditional treatment, and strong response to ultrasound biofeedback. The "high stimulability" group was more accurate at baseline and made significant gains in both traditional and ultrasound biofeedback phases of treatment. The clusters did not differ with respect to sensory acuity.

CONCLUSIONS: This research accords with clinical intuition in finding that individuals who are more stimulable at baseline are more likely to respond to traditional intervention, whereas less stimulable individuals may derive greater relative benefit from biofeedback.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.20422236.}, } @article {pmid35931553, year = {2022}, author = {Levi, SV}, title = {Teaching acoustic phonetics to undergraduates in communication sciences and disorders: Course structure and sample projects.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {1}, pages = {651}, doi = {10.1121/10.0012984}, pmid = {35931553}, issn = {1520-8524}, mesh = {Acoustics ; Communication ; Humans ; *Phonetics ; *Speech Acoustics ; Students ; }, abstract = {Virtually all undergraduate communication sciences and disorders programs require a course that covers acoustic phonetics. Students typically have a separate phonetics (transcription) course prior to taking the acoustic phonetics course. This paper describes a way to structure an acoustic phonetics course into two halves: a first half that focuses on the source, including basic acoustics (simple harmonic motion, harmonics), vocal fold vibration, modes of phonation, and intonation, and a second half that focuses on the filter, including resonance and tube models, vowel formants, and consonant acoustics. Thus, basic acoustic properties are interwoven with specific examples of speech-related acoustics. In addition, two projects that illustrate concepts from the two halves of the course (one on fundamental frequency and the other on vowel formants) are presented.}, } @article {pmid35931547, year = {2022}, author = {Mills, HE and Shorey, AE and Theodore, RM and Stilp, CE}, title = {Context effects in perception of vowels differentiated by F1 are not influenced by variability in talkers' mean F1 or F3.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {1}, pages = {55}, doi = {10.1121/10.0011920}, pmid = {35931547}, issn = {1520-8524}, mesh = {*Phonetics ; Sound ; Sound Spectrography ; Speech Acoustics ; *Speech Perception ; }, abstract = {Spectral properties of earlier sounds (context) influence recognition of later sounds (target). Acoustic variability in context stimuli can disrupt this process. When mean fundamental frequencies (f0's) of preceding context sentences were highly variable across trials, shifts in target vowel categorization [due to spectral contrast effects (SCEs)] were smaller than when sentence mean f0's were less variable; when sentences were rearranged to exhibit high or low variability in mean first formant frequencies (F1) in a given block, SCE magnitudes were equivalent [Assgari, Theodore, and Stilp (2019) J. Acoust. Soc. Am. 145(3), 1443-1454]. However, since sentences were originally chosen based on variability in mean f0, stimuli underrepresented the extent to which mean F1 could vary. Here, target vowels (/ɪ/-/ɛ/) were categorized following context sentences that varied substantially in mean F1 (experiment 1) or mean F3 (experiment 2) with variability in mean f0 held constant. In experiment 1, SCE magnitudes were equivalent whether context sentences had high or low variability in mean F1; the same pattern was observed in experiment 2 for new sentences with high or low variability in mean F3. 
Variability in some acoustic properties (mean f0) can be more perceptually consequential than others (mean F1, mean F3), but these results may be task-dependent.}, } @article {pmid35920586, year = {2023}, author = {Feng, Y and Peng, G}, title = {Development of categorical speech perception in Mandarin-speaking children and adolescents.}, journal = {Child development}, volume = {94}, number = {1}, pages = {28-43}, pmid = {35920586}, issn = {1467-8624}, mesh = {Male ; Adult ; Humans ; Child ; Adolescent ; *Speech Perception ; Cross-Sectional Studies ; Linguistics ; Asian ; China ; }, abstract = {Although children develop categorical speech perception at a very young age, the maturation process remains unclear. A cross-sectional study in Mandarin-speaking 4-, 6-, and 10-year-old children, 14-year-old adolescents, and adults (n = 104, 56 males, all Asians from mainland China) was conducted to investigate the development of categorical perception of four Mandarin phonemic contrasts: lexical tone contrast Tone 1-2, vowel contrast /u/-/i/, consonant aspiration contrast /p/-/p[h] /, and consonant formant transition contrast /p/-/t/. The results indicated that different types of phonemic contrasts, and even the identification and discrimination of the same phonemic contrast, matured asynchronously. The observation that tone and vowel perception are achieved earlier than consonant perception supports the phonological saliency hypothesis.}, } @article {pmid35916929, year = {2023}, author = {Song, J and Wan, Q and Wang, Y and Zhou, H}, title = {Establishment of a Multi-parameter Evaluation Model for Risk of Aspiration in Dysphagia: A Pilot Study.}, journal = {Dysphagia}, volume = {38}, number = {1}, pages = {406-414}, pmid = {35916929}, issn = {1432-0460}, mesh = {Humans ; Deglutition ; *Deglutition Disorders/diagnosis/etiology ; Pilot Projects ; Risk Factors ; }, abstract = {It's difficult for clinical bedside evaluations to accurately determine the occurrence of aspiration in patients. Although VFSS and FEES are the gold standards for clinical diagnosis of dysphagia, which are mainly used to evaluate people at high risk of dysphagia found by bedside screening, the operation is complicated and time-consuming. The aim of this pilot study was to present an objective measure based on a multi-parameter approach to screen for aspiration risk in patients with dysphagia. Objective evaluation techniques based on speech parameters were used to assess the oral motor function, vocal cord function, and voice changes before and after swallowing in 32 patients with dysphagia (16 low-risk aspiration group, 16 high-risk aspiration group). Student's t test combined with stepwise logistic regression were used to determine the optimal index. The best model consists of three parameters, and the equation is: logit(P) = - 3.824 - (0.504 × maximum phonation time) + (0.008 × second formant frequency of /u/) - 0.085 × (fundamental frequency difference before and after swallowing). An additional eight patients with dysphagia were randomly selected as the validation group of the model. When applied to validation, this model can accurately identify the risk of aspiration in 87.5% of patients, and the sensitivity is as high as 100%. 
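Applying the reported three-parameter equation is mechanical; a minimal sketch, with units assumed to be seconds for maximum phonation time and Hz for the formant and fundamental frequency terms (the abstract does not state them):

    import math

    def aspiration_risk(mpt_s, f2_u_hz, delta_f0_hz):
        # Predicted probability of high aspiration risk from the
        # three-parameter logistic model reported in the abstract.
        logit_p = (-3.824
                   - 0.504 * mpt_s         # maximum phonation time
                   + 0.008 * f2_u_hz       # second formant of /u/
                   - 0.085 * delta_f0_hz)  # F0 change after swallowing
        return 1.0 / (1.0 + math.exp(-logit_p))

    # Illustrative values only: a short phonation time and a high F2
    # of /u/ push the predicted probability up.
    print(round(aspiration_risk(5.0, 900.0, 2.0), 3))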
Therefore, it has certain clinical practical value that may help clinicians to assess the risk of aspiration in patients with dysphagia, especially for silent aspiration.}, } @article {pmid35905807, year = {2022}, author = {Lee, GS and Chang, CW}, title = {Comparisons of auditory brainstem response elicited by compound click-sawtooths sound and synthetic consonant-vowel /da/.}, journal = {Physiology & behavior}, volume = {255}, number = {}, pages = {113922}, doi = {10.1016/j.physbeh.2022.113922}, pmid = {35905807}, issn = {1873-507X}, mesh = {Acoustic Stimulation ; Evoked Potentials, Auditory/physiology ; *Evoked Potentials, Auditory, Brain Stem/physiology ; Humans ; Phonetics ; Reaction Time/physiology ; Sound ; *Speech Perception/physiology ; }, abstract = {The auditory brainstem response to complex sounds (cABR) could be evoked using speech sounds such as the 40 ms synthetic consonant-vowel syllable /da/ (CV-da) that was commonly used in basic and clinical research. cABR consists of responses to formant energy as well as the energy of fundamental frequency. The co-existence of the two energy makes cABR a mixed response. We introduced a new stimulus of click-sawtooths (CSW) with similar time-lock patterns but without formant or harmonic energy. Ten young healthy volunteers were recruited and the cABRs of CV-da and CSW of their 20 ears were acquired. The response latencies, amplitudes, and frequency-domain analytic results were compared pairwisely between stimuli. The response amplitudes were significantly greater for CSW and the latencies were significantly shorter for CSW. The latency-intensity functions were also greater for CSW. For CSW, adjustments of energy component can be made without causing biased changes to the other. CSW may be used in future basic research and clinical applications.}, } @article {pmid35894373, year = {2022}, author = {França, FP and Almeida, AA and Lopes, LW}, title = {Immediate effect of different exercises in the vocal space of women with and without vocal nodules.}, journal = {CoDAS}, volume = {34}, number = {5}, pages = {e20210157}, pmid = {35894373}, issn = {2317-1782}, mesh = {Exercise ; Female ; Humans ; Language ; *Phonetics ; *Speech Acoustics ; Tongue ; }, abstract = {PURPOSE: To investigate the immediate effect of voiced tongue vibration (VSL), high-resistance straw in the air (CAR), and overarticulation (OA) on the vocal space of vocally healthy women (MVS) and with vocal nodules (MNV).

METHODS: Twelve women participated in the MNV and 12 in the MVS, allocated to perform the vocal exercises of VSL, CAR, and OA. Each participant performed only one of the three proposed exercises, for 5 minutes, preceded and followed by the recording of a sequence of carrier sentences for extracting the formants (F1 and F2) of the vowel segments [a, i, u]. The vowel space was analyzed through the differences between the formant measures of the vowels.

RESULTS: We observed a reduction of F1 in the intervals [a]-[i] and [i]-[u] and of F2 between the vowels [a]-[u] and [i]-[u] in the MVS after performing the CAR. In the MNV, we observed a reduction of F2 in the interval [a]-[i] after VSL. In the intergroup analysis, there were higher F1 values in the intervals between the vowels [a]-[i] and [i]-[u] in the MVS before performing the CAR, and after exercise only in the interval [a]-[i]. A higher value of F1 and F2 was observed in the interval between the vowels [i]-[u] in the MNV after VSL.

CONCLUSION: The VSL exercise reduced the vowel space in MNV women. CAR reduced the vowel space of women in the MVS. The MNV had a smaller vowel space compared to the MVS before and after the CAR. We observed a reduction in the vowel space in the MVS compared to the MNV after the VSL exercise.}, } @article {pmid35874163, year = {2022}, author = {Wang, H and Max, L}, title = {Inter-Trial Formant Variability in Speech Production Is Actively Controlled but Does Not Affect Subsequent Adaptation to a Predictable Formant Perturbation.}, journal = {Frontiers in human neuroscience}, volume = {16}, number = {}, pages = {890065}, pmid = {35874163}, issn = {1662-5161}, support = {R01 DC014510/DC/NIDCD NIH HHS/United States ; R01 DC017444/DC/NIDCD NIH HHS/United States ; }, abstract = {Despite ample evidence that speech production is associated with extensive trial-to-trial variability, it remains unclear whether this variability represents merely unwanted system noise or an actively regulated mechanism that is fundamental for maintaining and adapting accurate speech movements. Recent work on upper limb movements suggests that inter-trial variability may be not only actively regulated based on sensory feedback, but also provide a type of workspace exploration that facilitates sensorimotor learning. We therefore investigated whether experimentally reducing or magnifying inter-trial formant variability in the real-time auditory feedback during speech production (a) leads to adjustments in formant production variability that compensate for the manipulation, (b) changes the temporal structure of formant adjustments across productions, and (c) enhances learning in a subsequent adaptation task in which a predictable formant-shift perturbation is applied to the feedback signal. Results show that subjects gradually increased formant variability in their productions when hearing auditory feedback with reduced variability, but subsequent formant-shift adaptation was not affected by either reducing or magnifying the perceived variability. Thus, findings provide evidence for speakers' active control of inter-trial formant variability based on auditory feedback from previous trials, but, at least for the current short-term experimental manipulation of feedback variability, not for a role of this variability regulation mechanism in subsequent auditory-motor learning.}, } @article {pmid35865705, year = {2022}, author = {Mailhos, A and Egea-Caparrós, DA and Guerrero Rodríguez, C and Luzardo, M and Kiskimska, ND and Martínez Sánchez, F}, title = {Vocal Cues to Male Physical Formidability.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {879102}, pmid = {35865705}, issn = {1664-1078}, abstract = {Animal vocalizations convey important information about the emitter, including sex, age, biological quality, and emotional state. Early on, Darwin proposed that sex differences in auditory signals and vocalizations were driven by sexual selection mechanisms. In humans, studies on the association between male voice attributes and physical formidability have thus far reported mixed results. Hence, with a view to furthering our understanding of the role of human voice in advertising physical formidability, we sought to identify acoustic attributes of male voices associated with physical formidability proxies.
Mean fundamental frequency (F0), formant dispersion (Df), formant position (Pf), and vocal tract length (VTL) data from a sample of 101 male voices were analyzed for potential associations with height, weight, and maximal handgrip strength (HGS). F0 correlated negatively with HGS; Pf showed negative correlations with HGS, height and weight, whereas VTL positively correlated with HGS, height and weight. All zero-order correlations remained significant after controlling for false discovery rate (FDR) with the Benjamini-Hochberg method. After controlling for height and weight, and controlling for FDR, the correlation between F0 and HGS remained significant. In addition, to evaluate the ability of human male voices to advertise physical formidability to potential mates, 151 heterosexual female participants rated the voices of the 10 strongest and the 10 weakest males from the original sample for perceived physical strength and, given that physical strength is a desirable attribute in male partners, perceived attractiveness. Generalized linear mixed model analyses, which allow for generalization of inferences to other samples of both raters and targets, failed to support a significant association between perceived strength or attractiveness from voices alone and actual physical strength. These results add to the growing body of work on the role of human voices in conveying relevant biological information.}, } @article {pmid35858255, year = {2022}, author = {Shao, J and Bakhtiar, M and Zhang, C}, title = {Impaired Categorical Perception of Speech Sounds Under the Backward Masking Condition in Adults Who Stutter.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {7}, pages = {2554-2570}, doi = {10.1044/2022_JSLHR-21-00276}, pmid = {35858255}, issn = {1558-9102}, mesh = {Adult ; Auditory Perception ; Child ; Humans ; Phonetics ; Speech ; *Speech Perception ; *Stuttering ; *Voice ; }, abstract = {PURPOSE: Evidence increasingly indicates that people with developmental stuttering have auditory perception deficits. Our previous research has indicated similar but slower performance in categorical perception of the speech sounds under the quiet condition in children who stutter and adults who stutter (AWS) compared with their typically fluent counterparts. We hypothesized that the quiet condition may not be sufficiently sensitive to reveal subtle perceptual deficiencies in people who stutter. This study examined this hypothesis by testing the categorical perception of speech and nonspeech sounds under the backward masking condition (i.e., a noise was presented immediately after the target stimuli).

METHOD: Fifteen Cantonese-speaking AWS and 15 adults who do not stutter (AWNS) were tested on the categorical perception of four stimulus continua, namely, consonant varying in voice onset time (VOT), vowel, lexical tone, and nonspeech, under the backward masking condition using identification and discrimination tasks.

RESULTS: AWS demonstrated a broader boundary width than AWNS in the identification task. AWS also exhibited a worse performance than AWNS in the discrimination of between-category stimuli but a comparable performance in the discrimination of within-category stimuli, indicating reduced sensitivity to sounds that belonged to different phonemic categories among AWS. Moreover, AWS showed similar patterns of impaired categorical perception across the four stimulus types, although the boundary location on the VOT continuum occurred at an earlier point in AWS than in AWNS.
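Boundary location and width of the kind reported here are commonly obtained by fitting a logistic psychometric function to the identification responses; a minimal sketch with hypothetical data (the authors' exact fitting procedure is not given in the abstract):

    import numpy as np
    from scipy.optimize import curve_fit

    def logistic(x, loc, slope):
        return 1.0 / (1.0 + np.exp(-slope * (x - loc)))

    # Hypothetical proportions of "voiced" responses along a 7-step
    # VOT continuum.
    steps = np.arange(1, 8)
    p_resp = np.array([0.02, 0.05, 0.15, 0.50, 0.85, 0.95, 0.98])

    (loc, slope), _ = curve_fit(logistic, steps, p_resp, p0=[4.0, 1.0])
    # Boundary location = 50% point; one common width measure is the
    # 25%-75% distance, which for a logistic equals 2*ln(3)/slope.
    width = 2 * np.log(3) / slope
    print(round(loc, 2), round(width, 2))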

CONCLUSIONS: The findings provide robust evidence that AWS exhibit impaired categorical perception of speech and nonspeech sounds under the backward masking condition. Temporal processing (i.e., VOT manipulation), frequency/spectral/formant processing (i.e., lexical tone or vowel manipulations), and nonlinguistic pitch processing were all found to be impaired in AWS. Altogether, the findings support the hypothesis that AWS might be less efficient in accessing the phonemic representations when exposed to a demanding listening condition.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.20249718.}, } @article {pmid35858067, year = {2022}, author = {Baciadonna, L and Solvi, C and Del Vecchio, F and Pilenga, C and Baracchi, D and Bandoli, F and Isaja, V and Gamba, M and Favaro, L}, title = {Vocal accommodation in penguins (Spheniscus demersus) as a result of social environment.}, journal = {Proceedings. Biological sciences}, volume = {289}, number = {1978}, pages = {20220626}, pmid = {35858067}, issn = {1471-2954}, mesh = {Animals ; Communication ; Humans ; Language ; Social Environment ; *Spheniscidae ; Vocalization, Animal ; }, abstract = {The ability to vary the characteristics of one's voice is a critical feature of human communication. Understanding whether and how animals change their calls will provide insights into the evolution of language. We asked to what extent the vocalizations of penguins, a phylogenetically distant species from those capable of explicit vocal learning, are flexible and responsive to their social environment. Using a principal components (PCs) analysis, we reduced 14 vocal parameters of penguin's contact calls to four PCs, each comprising highly correlated parameters and which can be categorized as fundamental frequency, formant frequency, frequency modulation, and amplitude modulation rate and duration. We compared how these differed between individuals with varying degrees of social interactions: same-colony versus different-colony, same colony over 3 years and partners versus non-partners. Our analyses indicate that the more penguins experience each other's calls, the more similar their calls become over time, that vocal convergence requires a long time and relative stability in colony membership, and that partners' unique social bond may affect vocal convergence differently than non-partners. Our results suggest that this implicit form of vocal plasticity is perhaps more widespread across the animal kingdom than previously thought and may be a fundamental capacity of vertebrate vocalization.}, } @article {pmid35804282, year = {2022}, author = {Easwar, V and Chung, L}, title = {The influence of phoneme contexts on adaptation in vowel-evoked envelope following responses.}, journal = {The European journal of neuroscience}, volume = {56}, number = {5}, pages = {4572-4582}, pmid = {35804282}, issn = {1460-9568}, mesh = {Acoustic Stimulation ; Humans ; Male ; Phonetics ; *Speech Perception/physiology ; }, abstract = {Repeated stimulus presentation leads to neural adaptation and consequent amplitude reduction in vowel-evoked envelope following responses (EFRs)-a response that reflects neural activity phase-locked to envelope periodicity. EFRs are elicited by vowels presented in isolation or in the context of other phonemes such as consonants in syllables. While context phonemes could exert some forward influence on vowel-evoked EFRs, they may reduce the degree of adaptation. Here, we evaluated whether the properties of context phonemes between consecutive vowel stimuli influence adaptation. EFRs were elicited by the low-frequency first formant (resolved harmonics) and middle-to-high-frequency second and higher formants (unresolved harmonics) of a male-spoken /i/ when the presence, number and predictability of context phonemes (/s/, /a/, /∫/ and /u/) between vowel repetitions varied. Monitored over four iterations of /i/, adaptation was evident only for EFRs elicited by the unresolved harmonics. 
EFRs elicited by the unresolved harmonics decreased in amplitude by ~16-20 nV (10%-17%) after the first presentation of /i/ and remained stable thereafter. EFR adaptation was reduced by the presence of a context phoneme, but the reduction did not change with the number or predictability of the context phonemes. The presence of a context phoneme, however, attenuated EFRs by a degree similar to that caused by adaptation (~21-23 nV). Such a trade-off in the short- and long-term influence of context phonemes suggests that the benefit of interleaving EFR-eliciting vowels with other context phonemes depends on whether the use of consonant-vowel syllables is critical to improve the validity of EFR applications.}, } @article {pmid35802401, year = {2022}, author = {Teferra, BG and Borwein, S and DeSouza, DD and Simpson, W and Rheault, L and Rose, J}, title = {Acoustic and Linguistic Features of Impromptu Speech and Their Association With Anxiety: Validation Study.}, journal = {JMIR mental health}, volume = {9}, number = {7}, pages = {e36828}, pmid = {35802401}, issn = {2368-7959}, abstract = {BACKGROUND: The measurement and monitoring of generalized anxiety disorder require frequent interaction with psychiatrists or psychologists. Access to mental health professionals is often difficult because of high costs or insufficient availability. The ability to assess generalized anxiety disorder passively and at frequent intervals could be a useful complement to conventional treatment and help with relapse monitoring. Prior work suggests that higher anxiety levels are associated with features of human speech. As such, monitoring speech using personal smartphones or other wearable devices may be a means to achieve passive anxiety monitoring.

OBJECTIVE: This study aims to validate the association of previously suggested acoustic and linguistic features of speech with anxiety severity.

METHODS: A large number of participants (n=2000) were recruited and participated in a single web-based study session. Participants completed the Generalized Anxiety Disorder 7-item scale assessment and provided an impromptu speech sample in response to a modified version of the Trier Social Stress Test. Acoustic and linguistic speech features were a priori selected based on the existing speech and anxiety literature, along with related features. Associations between speech features and anxiety levels were assessed using age and personal income as covariates.

RESULTS: Word count and speaking duration were negatively correlated with anxiety scores (r=-0.12; P<.001), indicating that participants with higher anxiety scores spoke less. Several acoustic features were also significantly (P<.05) associated with anxiety, including the mel-frequency cepstral coefficients, linear prediction cepstral coefficients, shimmer, fundamental frequency, and first formant. In contrast to previous literature, second and third formant, jitter, and zero crossing rate for the z score of the power spectral density acoustic features were not significantly associated with anxiety. Linguistic features, including negative-emotion words, were also associated with anxiety (r=0.10; P<.001). In addition, some linguistic relationships were sex dependent. For example, the count of words related to power was positively associated with anxiety in women (r=0.07; P=.03), whereas it was negatively associated with anxiety in men (r=-0.09; P=.01).
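Covariate-adjusted associations of this kind (here, age and personal income as covariates) can be computed as residual-based partial correlations; a sketch with synthetic data, noting that the study's exact statistical procedure may differ:

    import numpy as np
    from scipy import stats

    def partial_corr(x, y, covars):
        # Pearson correlation between x and y after regressing the
        # covariate columns out of both (residual method).
        Z = np.column_stack([np.ones(len(x)), covars])
        rx = x - Z @ np.linalg.lstsq(Z, x, rcond=None)[0]
        ry = y - Z @ np.linalg.lstsq(Z, y, rcond=None)[0]
        return stats.pearsonr(rx, ry)

    rng = np.random.default_rng(0)
    n = 200
    age = rng.uniform(18, 65, n)
    income = rng.uniform(1, 10, n)
    word_count = rng.normal(100, 20, n)
    anxiety = 20 - 0.05 * word_count + 0.02 * age + rng.normal(0, 3, n)
    print(partial_corr(word_count, anxiety, np.column_stack([age, income])))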

CONCLUSIONS: Both acoustic and linguistic speech measures are associated with anxiety scores. The amount of speech, acoustic quality of speech, and gender-specific linguistic characteristics of speech may be useful as part of a system to screen for anxiety, detect relapse, or monitor treatment.}, } @article {pmid35778699, year = {2022}, author = {Lin, YC and Yan, HT and Lin, CH and Chang, HH}, title = {Predicting frailty in older adults using vocal biomarkers: a cross-sectional study.}, journal = {BMC geriatrics}, volume = {22}, number = {1}, pages = {549}, pmid = {35778699}, issn = {1471-2318}, mesh = {Aged ; Biomarkers ; Cross-Sectional Studies ; Female ; Frail Elderly ; *Frailty/diagnosis/epidemiology ; Humans ; Male ; Odds Ratio ; *Osteoporotic Fractures ; }, abstract = {BACKGROUND: Frailty is a common issue in the aging population. Given that frailty syndrome is little discussed in the literature on the aging voice, the current study aims to examine the relationship between frailty and vocal biomarkers in older people.

METHODS: Participants aged ≥ 60 years visiting geriatric outpatient clinics were recruited. They underwent frailty assessment (Cardiovascular Health Study [CHS] index; Study of Osteoporotic Fractures [SOF] index; and Fatigue, Resistance, Ambulation, Illness, and Loss of weight [FRAIL] index) and were asked to pronounce a sustained vowel /a/ for approximately 1 s. Four voice parameters were assessed: average number of zero crossings (A1), variations in local peaks and valleys (A2), variations in first and second formant frequencies (A3), and spectral energy ratio (A4).
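Rough analogues of two of these parameters, an A1-style zero-crossing count and an A4-style spectral energy ratio, can be sketched as follows; the paper's exact definitions, frame sizes and cut-off frequency are not given in the abstract, so all settings here are assumptions:

    import numpy as np

    def zero_crossings(frame):
        # A1-style measure: number of sign changes in the waveform.
        return int(np.sum(np.signbit(frame[:-1]) != np.signbit(frame[1:])))

    def spectral_energy_ratio(frame, sr, split_hz=1000.0):
        # A4-style measure: energy below vs. above a cut-off frequency.
        spec = np.abs(np.fft.rfft(frame)) ** 2
        freqs = np.fft.rfftfreq(len(frame), d=1.0 / sr)
        return spec[freqs < split_hz].sum() / spec[freqs >= split_hz].sum()

    sr = 16000
    t = np.arange(sr) / sr                      # 1 s of signal
    vowel = np.sin(2 * np.pi * 120 * t) + 0.3 * np.sin(2 * np.pi * 2400 * t)
    print(zero_crossings(vowel), round(spectral_energy_ratio(vowel, sr), 2))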

RESULTS: Among 277 older adults, increased A1 was associated with a lower likelihood of frailty as defined by SOF (odds ratio [OR] 0.84, 95% confidence interval [CI] 0.74-0.96). Participants with larger A2 values were more likely to be frail, as defined by FRAIL and CHS (FRAIL: OR 1.41, 95% CI 1.12-1.79; CHS: OR 1.38, 95% CI 1.10-1.75). Sex differences were observed across the three frailty indices. In male participants, an increase in A3 by 10 points increased the odds of frailty by almost 7% (SOF: OR 1.07, 95% CI 1.02-1.12), 6% (FRAIL: OR 1.06, 95% CI 1.02-1.11), or 6% (CHS: OR 1.06, 95% CI 1.01-1.11). In female participants, an increase in A4 by 0.1 conferred a significant 2.8-fold (SOF: OR 2.81, 95% CI 1.71-4.62), 2.3-fold (FRAIL: OR 2.31, 95% CI 1.45-3.68), or 2.8-fold (CHS: OR 2.82, 95% CI 1.76-4.51) increased odds of frailty.
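The odds-ratio arithmetic follows directly from the underlying logistic model; for example, converting the reported per-10-point odds ratio for A3 into the multiplier implied for other increments:

    import math

    # The OR per unit comes from the logistic coefficient: OR = exp(beta).
    # Abstract: +10 points of A3 -> OR 1.07 (SOF), i.e. ~7% higher odds.
    beta_per_point = math.log(1.07) / 10

    # Implied odds multiplier for a 25-point increase in A3:
    print(math.exp(beta_per_point * 25))  # ~1.18, i.e. ~18% higher odds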

CONCLUSIONS: Vocal biomarkers, especially spectral-domain voice parameters, might have potential for estimating frailty as a non-invasive, instantaneous, objective, and cost-effective estimation tool, and the observed sex differences may support individualised treatment of frailty.}, } @article {pmid35778208, year = {2022}, author = {Jibson, J}, title = {Formant detail needed for identifying, rating, and discriminating vowels in Wisconsin English.}, journal = {The Journal of the Acoustical Society of America}, volume = {151}, number = {6}, pages = {4004}, doi = {10.1121/10.0011539}, pmid = {35778208}, issn = {1520-8524}, mesh = {*Language ; Wisconsin ; }, abstract = {Neel [(2004). Acoust. Res. Lett. Online 5, 125-131] asked how much time-varying formant detail is needed for vowel identification. In that study, multiple stimuli were synthesized for each vowel: 1-point (monophthongal with midpoint frequencies), 2-point (linear from onset to offset), 3-point, 5-point, and 11-point. Results suggested that a 3-point model was optimal. This conflicted with the dual-target hypothesis of vowel inherent spectral change research, which has found that two targets are sufficient to model vowel identification. The present study replicates and expands upon the work of Neel. Ten English monophthongs were chosen for synthesis. One-, two-, three-, and five-point vowels were created as described above, and another 1-point stimulus was created with onset frequencies rather than midpoint frequencies. Three experiments were administered (n = 18 for each): vowel identification, goodness rating, and discrimination. The results ultimately align with the dual-target hypothesis, consistent with most vowel inherent spectral change studies.}, } @article {pmid35749662, year = {2022}, author = {Groll, MD and Dahl, KL and Cádiz, MD and Welch, B and Tracy, LF and Stepp, CE}, title = {Resynthesis of Transmasculine Voices to Assess Gender Perception as a Function of Testosterone Therapy.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {7}, pages = {2474-2489}, pmid = {35749662}, issn = {1558-9102}, support = {R01 DC015570/DC/NIDCD NIH HHS/United States ; R01 DC020061/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Speech ; Speech Acoustics ; *Speech Perception ; Testosterone ; *Voice ; Young Adult ; }, abstract = {PURPOSE: The goal of this study was to use speech resynthesis to investigate the effects of changes to individual acoustic features on speech-based gender perception of transmasculine voice samples following the onset of hormone replacement therapy (HRT) with exogenous testosterone. We hypothesized that mean fundamental frequency (fo) would have the largest effect on gender perception of any single acoustic feature.

METHOD: Mean fo, fo contour, and formant frequencies were calculated for three pairs of transmasculine speech samples before and after HRT onset. Sixteen speech samples with unique combinations of these acoustic features from each pair of speech samples were resynthesized. Twenty young adult listeners evaluated each synthesized speech sample for gender perception and synthetic quality. Two analyses of variance were used to investigate the effects of acoustic features on gender perception and synthetic quality.
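The resynthesis step can be approximated in Praat through the parselmouth library; this is a rough analogue rather than the authors' actual pipeline, and the file name and shift values are hypothetical:

    import parselmouth
    from parselmouth.praat import call

    sound = parselmouth.Sound("speech_sample.wav")  # hypothetical file

    # Shift formant frequencies up by 10% and set the new pitch median
    # to 160 Hz, leaving pitch range and duration untouched.
    shifted = call(sound, "Change gender", 75, 600, 1.10, 160, 1.0, 1.0)
    shifted.save("speech_sample_shifted.wav", "WAV")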

RESULTS: Of the three acoustic features, mean fo was the only single feature that had a statistically significant effect on gender perception. Differences between the speech samples before and after HRT onset that were not captured by changes in fo and formant frequencies also had a statistically significant effect on gender perception.

CONCLUSION: In these transmasculine voice samples, mean fo was the most important acoustic feature for voice masculinization as a result of HRT; future investigations in a larger number of transmasculine speakers and on the effects of behavioral therapy-based changes in concert with HRT are warranted.}, } @article {pmid35744460, year = {2022}, author = {Yan, S and Liu, P and Chen, Z and Liu, J and Shen, L and Zhang, X and Cui, J and Li, T and Cui, Y and Ren, Y}, title = {High-Property Refractive Index and Bio-Sensing Dual-Purpose Sensor Based on SPPs.}, journal = {Micromachines}, volume = {13}, number = {6}, pages = {}, pmid = {35744460}, issn = {2072-666X}, abstract = {A high-property plasma resonance-sensor structure consisting of two metal-insulator-metal (MIM) waveguides coupled with a transverse ladder-shaped nano-cavity (TLSNC) is designed based on surface plasmon polaritons. Its transmission characteristics are analyzed using multimode interference coupling mode theory (MICMT), and are simulated using finite element analysis (FEA). Meanwhile, the influence of different structural parameters on the performance of the structure is investigated. This study shows that the system presents four high-quality formants in the transmission spectrum. The highest sensitivity is 3000 nm/RIU with a high FOM[*] of 9.7 × 10[5]. In addition, the proposed structure could act as a biosensor to detect the concentrations of sodium ions (Na[+]), potassium ions (K[+]), and the glucose solution with maximum sensitivities of 0.45, 0.625 and 5.5 nm/mgdL[-1], respectively. Compared with other structures, the designed system has the advantages of a simple construction, a wide working band range, high reliability and easy nano-scale integration, providing a high-performance cavity choice for refractive index sensing and biosensing devices based on surface plasmons.}, } @article {pmid35737731, year = {2022}, author = {Ham, J and Yoo, HJ and Kim, J and Lee, B}, title = {Vowel speech recognition from rat electroencephalography using long short-term memory neural network.}, journal = {PloS one}, volume = {17}, number = {6}, pages = {e0270405}, pmid = {35737731}, issn = {1932-6203}, mesh = {Animals ; Electroencephalography/methods ; Male ; Memory, Short-Term ; Neural Networks, Computer ; Rats ; Rats, Sprague-Dawley ; Speech ; *Speech Perception ; }, abstract = {Over the years, considerable research has been conducted to investigate the mechanisms of speech perception and recognition. Electroencephalography (EEG) is a powerful tool for identifying brain activity; therefore, it has been widely used to determine the neural basis of speech recognition. In particular, for the classification of speech recognition, deep learning-based approaches are in the spotlight because they can automatically learn and extract representative features through end-to-end learning. This study aimed to identify particular components that are potentially related to phoneme representation in the rat brain and to discriminate brain activity for each vowel stimulus on a single-trial basis using a bidirectional long short-term memory (BiLSTM) network and classical machine learning methods. Nineteen male Sprague-Dawley rats underwent microelectrode implantation surgery to record EEG signals from the bilateral anterior auditory fields. Five different vowel speech stimuli were chosen, /a/, /e/, /i/, /o/, and /u/, which have highly different formant frequencies.
EEG recorded under randomly given vowel stimuli was minimally preprocessed and normalized by a z-score transformation to be used as input for the classification of speech recognition. The BiLSTM network showed the best performance among the classifiers by achieving an overall accuracy, f1-score, and Cohen's κ values of 75.18%, 0.75, and 0.68, respectively, using a 10-fold cross-validation approach. These results indicate that LSTM layers can effectively model sequential data, such as EEG; hence, informative features can be derived through BiLSTM trained with end-to-end learning without any additional hand-crafted feature extraction methods.}, } @article {pmid35731636, year = {2023}, author = {Pravitharangul, N and Miyamoto, JJ and Yoshizawa, H and Matsumoto, T and Suzuki, S and Chantarawaratit, PO and Moriyama, K}, title = {Vowel sound production and its association with cephalometric characteristics in skeletal Class III subjects.}, journal = {European journal of orthodontics}, volume = {45}, number = {1}, pages = {20-28}, doi = {10.1093/ejo/cjac031}, pmid = {35731636}, issn = {1460-2210}, mesh = {Male ; Humans ; *Speech Acoustics ; Speech ; Acoustics ; Cephalometry ; *Overbite ; }, abstract = {BACKGROUND: This study aimed to evaluate differences in vowel production using acoustic analysis in skeletal Class III and Class I Japanese participants and to identify the correlation between vowel sounds and cephalometric variables in skeletal Class III subjects.

MATERIALS AND METHODS: Japanese males with skeletal Class III (ANB < 0°) and Class I skeletal anatomy (0.62° < ANB < 5.94°) were recruited (n = 18/group). Acoustic analysis of vowel sounds and cephalometric analysis of lateral cephalograms were performed. For sound analysis, an isolated Japanese vowel (/a/, /i/, /u/, /e/, /o/) pattern was recorded. Praat software was used to extract acoustic parameters such as fundamental frequency (F0) and the first four formants (F1, F2, F3, and F4). The formant graph area was calculated. Cephalometric values were obtained using ImageJ. Correlations between acoustic and cephalometric variables in skeletal Class III subjects were then investigated.
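Measurements like these can be scripted against Praat via the parselmouth interface; the file name is hypothetical and the analysis settings below are Praat defaults, not necessarily those of the study:

    import numpy as np
    import parselmouth

    snd = parselmouth.Sound("vowel_o.wav")  # hypothetical recording

    pitch = snd.to_pitch()
    f0 = pitch.selected_array['frequency']
    f0_mean = float(np.mean(f0[f0 > 0]))    # ignore unvoiced frames

    formants = snd.to_formant_burg(max_number_of_formants=5)
    t_mid = snd.duration / 2
    f1_to_f4 = [formants.get_value_at_time(i, t_mid) for i in range(1, 5)]
    print(f0_mean, f1_to_f4)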

RESULTS: Skeletal Class III subjects exhibited significantly higher /o/ F2 and lower /o/ F4 values. Mandibular length, SNB, and overjet of Class III subjects were moderately negatively correlated with acoustic variables.

LIMITATIONS: This study did not take into account vertical skeletal patterns and tissue movements during sound production.

CONCLUSION: Skeletal Class III males produced different /o/ (back and rounded vowel), possibly owing to their anatomical positions or adaptive changes. Vowel production was moderately associated with cephalometric characteristics of Class III subjects. Thus, changes in speech after orthognathic surgery may be expected. A multidisciplinary team approach that included the input of a speech pathologist would be useful.}, } @article {pmid35728449, year = {2022}, author = {Kabakoff, H and Gritsyk, O and Harel, D and Tiede, M and Preston, JL and Whalen, DH and McAllister, T}, title = {Characterizing sensorimotor profiles in children with residual speech sound disorder: a pilot study.}, journal = {Journal of communication disorders}, volume = {99}, number = {}, pages = {106230}, pmid = {35728449}, issn = {1873-7994}, support = {F31 DC018197/DC/NIDCD NIH HHS/United States ; R01 DC013668/DC/NIDCD NIH HHS/United States ; R01 DC017476/DC/NIDCD NIH HHS/United States ; }, mesh = {Adolescent ; *Apraxias ; Child ; Humans ; *Language Development Disorders ; Pilot Projects ; Speech ; Speech Production Measurement ; *Speech Sound Disorder/therapy ; *Stuttering ; }, abstract = {PURPOSE: Children with speech errors who have reduced motor skill may be more likely to develop residual errors associated with lifelong challenges. Drawing on models of speech production that highlight the role of somatosensory acuity in updating motor plans, this pilot study explored the relationship between motor skill and speech accuracy, and between somatosensory acuity and motor skill in children. Understanding the connections among sensorimotor measures and speech outcomes may offer insight into how somatosensation and motor skill cooperate during speech production, which could inform treatment decisions for this population.

METHOD: Twenty-five children (ages 9-14) produced syllables in an /ɹ/ stimulability task before and after an ultrasound biofeedback treatment program targeting rhotics. We first tested whether motor skill (as measured by two ultrasound-based metrics of tongue shape complexity) predicted acoustically measured accuracy (the normalized difference between the second and third formant frequencies). We then tested whether somatosensory acuity (as measured by an oral stereognosis task) predicted motor skill, while controlling for auditory acuity.
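The acoustic accuracy metric, a normalized distance between the second and third formants, can be sketched as below; the exact normalization used by the authors is not specified in the abstract, so dividing by the sum is an assumption:

    def r_accuracy_index(f2_hz, f3_hz):
        # Normalized F3-F2 distance: smaller values indicate the low-F3
        # constriction typical of a well-formed /ɹ/.
        return (f3_hz - f2_hz) / (f3_hz + f2_hz)

    print(r_accuracy_index(1800.0, 2200.0))  # more /ɹ/-like (low F3)
    print(r_accuracy_index(1700.0, 2900.0))  # less /ɹ/-like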

RESULTS: One measure of tongue shape complexity was a significant predictor of accuracy, such that higher tongue shape complexity was associated with lower accuracy at pre-treatment but higher accuracy at post-treatment. Based on the same measure, children with better somatosensory acuity produced /ɹ/ tongue shapes that were more complex, but this relationship was only present at post-treatment.

CONCLUSION: The predicted relationships among somatosensory acuity, motor skill, and acoustically measured /ɹ/ production accuracy were observed after treatment, but unexpectedly did not hold before treatment. The surprising finding that greater tongue shape complexity was associated with lower accuracy at pre-treatment highlights the importance of evaluating tongue shape patterns (e.g., using ultrasound) prior to treatment, and suggests that children with high tongue shape complexity at pre-treatment may be good candidates for ultrasound-based treatment.}, } @article {pmid35727115, year = {2022}, author = {González-Alvarez, J and Sos-Peña, R}, title = {Perceiving Body Height From Connected Speech: Higher Fundamental Frequency Is Associated With the Speaker's Height.}, journal = {Perceptual and motor skills}, volume = {129}, number = {5}, pages = {1349-1361}, doi = {10.1177/00315125221110392}, pmid = {35727115}, issn = {1558-688X}, mesh = {Body Height ; Body Size ; Female ; Humans ; Male ; *Speech ; *Speech Perception ; }, abstract = {To a certain degree, human listeners can perceive a speaker's body size from their voice. The speaker's voice pitch or fundamental frequency (Fo) and the vocal formant frequencies are the voice parameters that have been most intensively studied in past body size perception research (particularly for body height). Artificially lowering the Fo of isolated vowels from male speakers improved listeners' accuracy of binary (i.e., tall vs not tall) body height perceptions. This has been explained by the theory that a denser harmonic spectrum provided by a low pitch improved the perceptual resolution of formants that aid formant-based size assessments. In the present study, we extended this research using connected speech (i.e., words and sentences) pronounced by speakers of both sexes. Unexpectedly, we found that raising Fo, not lowering it, increased the participants' perceptual performance in two binary discrimination tasks of body size. We explain our new finding in the temporal domain by the dynamic and time-varying acoustic properties of connected speech. Increased Fo might increase the sampling density of sound wave acoustic cycles and provide more detailed information, such as higher resolution, on the envelope shape.}, } @article {pmid35712147, year = {2022}, author = {Sugiyama, Y}, title = {Identification of Minimal Pairs of Japanese Pitch Accent in Noise-Vocoded Speech.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {887761}, pmid = {35712147}, issn = {1664-1078}, abstract = {The perception of lexical pitch accent in Japanese was assessed using noise-excited vocoder speech, which contained no fundamental frequency (fo) or its harmonics. While prosodic information such as lexical stress in English and lexical tone in Mandarin Chinese is known to be encoded in multiple acoustic dimensions, such multidimensionality is less understood for lexical pitch accent in Japanese. In the present study, listeners were tested under four different conditions to investigate the contribution of non-fo properties to the perception of Japanese pitch accent: noise-vocoded speech stimuli consisting of 10 3-ERBN-wide bands and 15 2-ERBN-wide bands created from a male and female speaker. Results found listeners were able to identify minimal pairs of final-accented and unaccented words at a rate better than chance in all conditions, indicating the presence of secondary cues to Japanese pitch accent.
Subsequent analyses were conducted to investigate if the listeners' ability to distinguish minimal pairs was correlated with duration, intensity or formant information. The results found no strong or consistent correlation, suggesting the possibility that listeners used different cues depending on the information available in the stimuli. Furthermore, the comparison of the current results with equivalent studies in English and Mandarin Chinese suggest that, although lexical prosodic information exists in multiple acoustic dimensions in Japanese, the primary cue is more salient than in other languages.}, } @article {pmid35700949, year = {2022}, author = {Preisig, BC and Riecke, L and Hervais-Adelman, A}, title = {Speech sound categorization: The contribution of non-auditory and auditory cortical regions.}, journal = {NeuroImage}, volume = {258}, number = {}, pages = {119375}, doi = {10.1016/j.neuroimage.2022.119375}, pmid = {35700949}, issn = {1095-9572}, mesh = {Acoustic Stimulation/methods ; *Auditory Cortex/diagnostic imaging/physiology ; Auditory Perception ; Hearing ; Humans ; Phonetics ; Speech/physiology ; *Speech Perception/physiology ; }, abstract = {Which processes in the human brain lead to the categorical perception of speech sounds? Investigation of this question is hampered by the fact that categorical speech perception is normally confounded by acoustic differences in the stimulus. By using ambiguous sounds, however, it is possible to dissociate acoustic from perceptual stimulus representations. Twenty-seven normally hearing individuals took part in an fMRI study in which they were presented with an ambiguous syllable (intermediate between /da/ and /ga/) in one ear and with disambiguating acoustic feature (third formant, F3) in the other ear. Multi-voxel pattern searchlight analysis was used to identify brain areas that consistently differentiated between response patterns associated with different syllable reports. By comparing responses to different stimuli with identical syllable reports and identical stimuli with different syllable reports, we disambiguated whether these regions primarily differentiated the acoustics of the stimuli or the syllable report. We found that BOLD activity patterns in left perisylvian regions (STG, SMG), left inferior frontal regions (vMC, IFG, AI), left supplementary motor cortex (SMA/pre-SMA), and right motor and somatosensory regions (M1/S1) represent listeners' syllable report irrespective of stimulus acoustics. Most of these regions are outside of what is traditionally regarded as auditory or phonological processing areas. Our results indicate that the process of speech sound categorization implicates decision-making mechanisms and auditory-motor transformations.}, } @article {pmid35694910, year = {2023}, author = {Sayyahi, F and Boulenger, V}, title = {A temporal-based therapy for children with inconsistent phonological disorder: A case-series.}, journal = {Clinical linguistics & phonetics}, volume = {37}, number = {7}, pages = {655-681}, doi = {10.1080/02699206.2022.2075792}, pmid = {35694910}, issn = {1464-5076}, mesh = {Child, Preschool ; Humans ; Child ; *Speech Sound Disorder/therapy ; Phonetics ; Speech ; Language ; Vocabulary ; }, abstract = {Deficits in temporal auditory processing, and in particular higher gap detection thresholds have been reported in children with inconsistent phonological disorder (IPD). 
Here we hypothesized that providing these children with extra time for phoneme identification may in turn enhance their phonological planning abilities for production, and accordingly improve not only consistency but also accuracy of their speech. We designed and tested a new temporal-based therapy, inspired by Core Vocabulary Therapy and called it T-CVT, where we digitally lengthened formant transitions between phonemes of words used for therapy. This allowed to target both temporal auditory processing and word phonological planning. Four preschool Persian native children with IPD received T-CVT for eight weeks. We measured changes in speech consistency (% inconsistency) and accuracy (percentage of consonants correct PCC) to assess the effects of the intervention. Therapy significantly improved both consistency and accuracy of word production in the four children: % inconsistency decreased from 59% on average before therapy to 2% post-T-CVT, and PCC increased from 61% to 92% on average. Consistency and accuracy were furthermore maintained or even still improved at three-month follow-up (2% inconsistency and 99% PCC). Results in a nonword repetition task showed the generalization of these effects to non-treated material: % inconsistency for nonwords decreased from 67% to 10% post-therapy, and PCC increased from 63% to 90%. These preliminary findings support the efficacy of the T-CVT intervention for children with IPD who show temporal auditory processing deficits as reflected by higher gap detection thresholds.}, } @article {pmid35673798, year = {2022}, author = {Di Dona, G and Scaltritti, M and Sulpizio, S}, title = {Formant-invariant voice and pitch representations are pre-attentively formed from constantly varying speech and non-speech stimuli.}, journal = {The European journal of neuroscience}, volume = {56}, number = {3}, pages = {4086-4106}, pmid = {35673798}, issn = {1460-9568}, mesh = {Acoustic Stimulation/methods ; Attention ; Female ; Humans ; Male ; Reaction Time ; Speech ; *Speech Perception ; }, abstract = {The present study investigated whether listeners can form abstract voice representations while ignoring constantly changing phonological information and if they can use the resulting information to facilitate voice change detection. Further, the study aimed at understanding whether the use of abstraction is restricted to the speech domain or can be deployed also in non-speech contexts. We ran an electroencephalogram (EEG) experiment including one passive and one active oddball task, each featuring a speech and a rotated speech condition. In the speech condition, participants heard constantly changing vowels uttered by a male speaker (standard stimuli) which were infrequently replaced by vowels uttered by a female speaker with higher pitch (deviant stimuli). In the rotated speech condition, participants heard rotated vowels, in which the natural formant structure of speech was disrupted. In the passive task, the mismatch negativity was elicited after the presentation of the deviant voice in both conditions, indicating that listeners could successfully group together different stimuli into a formant-invariant voice representation. In the active task, participants showed shorter reaction times (RTs), higher accuracy and a larger P3b in the speech condition with respect to the rotated speech condition. 
Results showed that whereas at a pre-attentive level the cognitive system can track pitch regularities while presumably ignoring constantly changing formant information both in speech and in rotated speech, at an attentive level the use of such information is facilitated for speech. This facilitation was further evidenced by a stronger synchronisation in the theta band (4-7 Hz), potentially pointing towards differences in encoding/retrieval processes.}, } @article {pmid35667724, year = {2022}, author = {Hampsey, E and Meszaros, M and Skirrow, C and Strawbridge, R and Taylor, RH and Chok, L and Aarsland, D and Al-Chalabi, A and Chaudhuri, R and Weston, J and Fristed, E and Podlewska, A and Awogbemila, O and Young, AH}, title = {Protocol for Rhapsody: a longitudinal observational study examining the feasibility of speech phenotyping for remote assessment of neurodegenerative and psychiatric disorders.}, journal = {BMJ open}, volume = {12}, number = {6}, pages = {e061193}, pmid = {35667724}, issn = {2044-6055}, mesh = {Feasibility Studies ; Humans ; Longitudinal Studies ; *Mental Disorders ; *Mobile Applications ; Observational Studies as Topic ; Speech ; }, abstract = {INTRODUCTION: Neurodegenerative and psychiatric disorders (NPDs) confer a huge health burden, which is set to increase as populations age. New, remotely delivered diagnostic assessments that can detect early-stage NPDs by profiling speech could enable earlier intervention and fewer missed diagnoses. The feasibility of collecting speech data remotely in those with NPDs should be established.

METHODS AND ANALYSIS: The present study will assess the feasibility of obtaining speech data, collected remotely using a smartphone app, from individuals across three NPD cohorts: neurodegenerative cognitive diseases (n=50), other neurodegenerative diseases (n=50) and affective disorders (n=50), in addition to matched controls (n=75). Participants will complete audio-recorded speech tasks and both general and cohort-specific symptom scales. The battery of speech tasks will serve several purposes, such as measuring various elements of executive control (eg, attention and short-term memory), as well as measures of voice quality. Participants will then remotely self-administer speech tasks and follow-up symptom scales over a 4-week period. The primary objective is to assess the feasibility of remote collection of continuous narrative speech across a wide range of NPDs using self-administered speech tasks. Additionally, the study evaluates if acoustic and linguistic patterns can predict diagnostic group, as measured by the sensitivity, specificity, Cohen's kappa and area under the receiver operating characteristic curve of the binary classifiers distinguishing each diagnostic group from each other. Acoustic features analysed include mel-frequency cepstrum coefficients, formant frequencies, intensity and loudness, whereas text-based features such as number of words, noun and pronoun rate and idea density will also be used.
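For the acoustic features listed here, a minimal sketch of MFCC summary-feature extraction with librosa; the file name is hypothetical and the protocol's actual feature pipeline is not specified in the abstract:

    import numpy as np
    import librosa

    # Hypothetical speech-task recording collected by the app.
    y, sr = librosa.load("speech_task.wav", sr=16000)

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)   # (13, n_frames)
    # One fixed-length feature vector per recording: per-coefficient
    # means and standard deviations across frames.
    features = np.concatenate([mfcc.mean(axis=1), mfcc.std(axis=1)])
    print(features.shape)  # (26,)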

ETHICS AND DISSEMINATION: The study received ethical approval from the Health Research Authority and Health and Care Research Wales (REC reference: 21/PR/0070). Results will be disseminated through open access publication in academic journals, relevant conferences and other publicly accessible channels. Results will be made available to participants on request.

TRIAL REGISTRATION NUMBER: NCT04939818.}, } @article {pmid35664509, year = {2022}, author = {Roessig, S and Winter, B and Mücke, D}, title = {Tracing the Phonetic Space of Prosodic Focus Marking.}, journal = {Frontiers in artificial intelligence}, volume = {5}, number = {}, pages = {842546}, pmid = {35664509}, issn = {2624-8212}, abstract = {Focus is known to be expressed by a wide range of phonetic cues but only a few studies have explicitly compared different phonetic variables within the same experiment. Therefore, we presented results from an analysis of 19 phonetic variables conducted on a data set of the German language that comprises the opposition of unaccented (background) vs. accented (in focus), as well as different focus types with the nuclear accent on the same syllable (broad, narrow, and contrastive focus). The phonetic variables are measures of the acoustic and articulographic signals of a target syllable. Overall, our results provide the highest number of reliable effects and largest effect sizes for accentuation (unaccented vs. accented), while the differentiation of focus types with accented target syllables (broad, narrow, and contrastive focus) are more subtle. The most important phonetic variables across all conditions are measures of the fundamental frequency. The articulatory variables and their corresponding acoustic formants reveal lower tongue positions for both vowels /o, a/, and larger lip openings for the vowel /a/ under increased prosodic prominence with the strongest effects for accentuation. While duration exhibits consistent mid-ranked results for both accentuation and the differentiation of focus types, measures related to intensity are particularly important for accentuation. Furthermore, voice quality and spectral tilt are affected by accentuation but also in the differentiation of focus types. Our results confirm that focus is realized via multiple phonetic cues. Additionally, the present analysis allows a comparison of the relative importance of different measures to better understand the phonetic space of focus marking.}, } @article {pmid35664350, year = {2022}, author = {Coughler, C and Quinn de Launay, KL and Purcell, DW and Oram Cardy, J and Beal, DS}, title = {Pediatric Responses to Fundamental and Formant Frequency Altered Auditory Feedback: A Scoping Review.}, journal = {Frontiers in human neuroscience}, volume = {16}, number = {}, pages = {858863}, pmid = {35664350}, issn = {1662-5161}, abstract = {PURPOSE: The ability to hear ourselves speak has been shown to play an important role in the development and maintenance of fluent and coherent speech. Despite this, little is known about the developing speech motor control system throughout childhood, in particular if and how vocal and articulatory control may differ throughout development. A scoping review was undertaken to identify and describe the full range of studies investigating responses to frequency altered auditory feedback in pediatric populations and their contributions to our understanding of the development of auditory feedback control and sensorimotor learning in childhood and adolescence.

METHOD: Relevant studies were identified through a comprehensive search strategy of six academic databases for studies that included (a) real-time perturbation of frequency in auditory input, (b) an analysis of immediate effects on speech, and (c) participants aged 18 years or younger.

RESULTS: Twenty-three articles met inclusion criteria. Across studies, there was a wide variety of designs, outcomes and measures used. Manipulations included fundamental frequency (9 studies), formant frequency (12), frequency centroid of fricatives (1), and both fundamental and formant frequencies (1). Study designs included contrasts across childhood, between children and adults, and between typical, pediatric clinical and adult populations. Measures primarily explored acoustic properties of speech responses (latency, magnitude, and variability). Some studies additionally examined the association of these acoustic responses with clinical measures (e.g., stuttering severity and reading ability), and neural measures using electrophysiology and magnetic resonance imaging.

CONCLUSION: Findings indicated that children above 4 years generally compensated in the opposite direction of the manipulation; however, in several cases not as effectively as adults. Overall, results varied greatly due to the broad range of manipulations and designs used, making generalization challenging. Differences found between age groups in the features of the compensatory vocal responses, latency of responses, vocal variability and perceptual abilities suggest that maturational changes may be occurring in the speech motor control system, affecting the extent to which auditory feedback is used to modify internal sensorimotor representations. Varied findings suggest vocal control develops prior to articulatory control. Future studies with multiple outcome measures, manipulations, and more expansive age ranges are needed to elucidate findings.}, } @article {pmid35634052, year = {2022}, author = {Wang, X and Wang, T}, title = {Voice Recognition and Evaluation of Vocal Music Based on Neural Network.}, journal = {Computational intelligence and neuroscience}, volume = {2022}, number = {}, pages = {3466987}, pmid = {35634052}, issn = {1687-5273}, mesh = {Humans ; *Music ; Neural Networks, Computer ; Voice Quality ; Voice Recognition ; Voice Training ; }, abstract = {Artistic voice is the artistic life of professional voice users. In the process of selecting and cultivating artistic performing talents, the evaluation of the voice occupies a very important position. Therefore, an appropriate evaluation of the artistic voice is crucial. With the development of art education, scientifically evaluating artistic voice training methods and fairly selecting artistic voice talents are urgent needs for the objective evaluation of the artistic voice. The current evaluation methods for artistic voices are time-consuming, laborious, and highly subjective. In the objective evaluation of the artistic voice, the selection of evaluation acoustic parameters is very important. This study attempts to extract the average energy, average frequency error, and average range error of the singing voice using speech analysis technology as objective evaluation acoustic parameters, to evaluate the singing quality of the artistic voice objectively with a neural network method, and to compare the results with the subjective evaluation of senior professional teachers. In this paper, voice analysis technology is used to extract the first formant, third formant, fundamental frequency, sound range, fundamental frequency perturbation, first formant perturbation, third formant perturbation, and average energy of singing as acoustic parameters. By using BP neural network methods, the quality of singing was evaluated objectively and compared with the subjective evaluation of senior vocal professional teachers.
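A BP ("backpropagation") network of the kind described is simply a small multilayer perceptron; a minimal sketch mapping the extracted acoustic parameters to expert ratings, with entirely hypothetical data:

    import numpy as np
    from sklearn.neural_network import MLPRegressor

    rng = np.random.default_rng(0)
    # Hypothetical singing features: F1, F3, F0, range, perturbations,
    # and average energy (8 values per singer).
    X = rng.normal(size=(120, 8))
    teacher_scores = rng.uniform(60, 100, size=120)  # expert ratings

    bp_net = MLPRegressor(hidden_layer_sizes=(16,), max_iter=2000,
                          random_state=0).fit(X, teacher_scores)
    print(bp_net.predict(X[:3]))  # objective scores for new samples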
The results show that the BP neural network method can accurately and objectively evaluate the quality of singing voice by using the evaluation parameters, which is helpful in scientifically guiding the selection and training of artistic voice talents.}, } @article {pmid35612119, year = {2022}, author = {Rafi, S and Gangloff, C and Paulhet, E and Grimault, O and Soulat, L and Bouzillé, G and Cuggia, M}, title = {Out-of-Hospital Cardiac Arrest Detection by Machine Learning Based on the Phonetic Characteristics of the Caller's Voice.}, journal = {Studies in health technology and informatics}, volume = {294}, number = {}, pages = {445-449}, doi = {10.3233/SHTI220498}, pmid = {35612119}, issn = {1879-8365}, mesh = {*Cardiopulmonary Resuscitation ; Emergency Medical Service Communication Systems ; *Emergency Medical Services ; Humans ; Machine Learning ; *Out-of-Hospital Cardiac Arrest/diagnosis ; Phonetics ; }, abstract = {INTRODUCTION: Out-of-hospital cardiac arrest (OHCA) is a major public health issue. The prognosis is closely related to the time from collapse to return of spontaneous circulation. Resuscitation efforts are frequently initiated at the request of emergency call center professionals who are specifically trained to identify critical conditions over the phone. However, 25% of OHCAs are not recognized during the first call. Therefore, it would be interesting to develop automated computer systems to recognize OHCA on the phone. The aim of this study was to build and evaluate machine learning models for OHCA recognition based on the phonetic characteristics of the caller's voice.

METHODS: All patients for whom a call was made to the emergency call center of Rennes, France, between 01/01/2017 and 01/01/2019 were eligible. The predicted variable was OHCA presence. Predictor variables were collected by computer-automated phonetic analysis of the call. They were based on the following voice parameters: fundamental frequency, formants, intensity, jitter, shimmer, harmonic-to-noise ratio, number of voice breaks, and number of periods. Three models were generated using binary logistic regression, random forest, and neural network. The area under the curve (AUC) was the primary outcome used to evaluate each model's performance.
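(A rough illustration of the pipeline this METHODS paragraph describes, under stated assumptions: the sketch below uses parselmouth, the Python bindings for Praat, with common default argument values rather than the authors' settings, and scikit-learn stands in for their unnamed modelling tooling. The variables calls and is_ohca are hypothetical placeholders for the call recordings and OHCA labels.)

import numpy as np
import parselmouth
from parselmouth.praat import call
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score

def voice_features(wav_path):
    snd = parselmouth.Sound(wav_path)
    pitch = snd.to_pitch()
    pulses = call(snd, "To PointProcess (periodic, cc)", 75.0, 500.0)
    return [
        call(pitch, "Get mean", 0, 0, "Hertz"),                      # fundamental frequency
        snd.get_intensity(),                                          # mean intensity (dB)
        call(pulses, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3),  # jitter
        call([snd, pulses], "Get shimmer (local)",
             0, 0, 0.0001, 0.02, 1.3, 1.6),                           # shimmer
        call(snd.to_harmonicity(), "Get mean", 0, 0),                 # harmonic-to-noise ratio
        call(pulses, "Get number of periods", 0, 0, 0.0001, 0.02, 1.3),
    ]

# calls: list of WAV paths; is_ohca: 0/1 labels (both placeholders).
X = np.array([voice_features(p) for p in calls])
y = np.array(is_ohca)
for model in (LogisticRegression(max_iter=1000),
              RandomForestClassifier(n_estimators=500, random_state=0),
              MLPClassifier(max_iter=2000, random_state=0)):
    prob = cross_val_predict(model, X, y, cv=5, method="predict_proba")[:, 1]
    print(type(model).__name__, round(roc_auc_score(y, prob), 3))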

RESULTS: 820 patients were included in the study. The best model to predict OHCA was the random forest (AUC = 74.9, 95% CI = 67.4-82.4).

CONCLUSION: Machine learning models based on the acoustic characteristics of the caller's voice can recognize OHCA. The integration of the acoustic parameters identified in this study will help to design decision-making support systems to improve OHCA detection over the phone.}, } @article {pmid35548492, year = {2022}, author = {Tomaschek, F and Ramscar, M}, title = {Understanding the Phonetic Characteristics of Speech Under Uncertainty-Implications of the Representation of Linguistic Knowledge in Learning and Processing.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {754395}, pmid = {35548492}, issn = {1664-1078}, abstract = {The uncertainty associated with paradigmatic families has been shown to correlate with their phonetic characteristics in speech, suggesting that representations of complex sublexical relations between words are part of speaker knowledge. To better understand this, recent studies have used two-layer neural network models to examine the way paradigmatic uncertainty emerges in learning. However, to date this work has largely ignored the way choices about the representation of inflectional and grammatical functions (IFS) in models strongly influence what they subsequently learn. To explore the consequences of this, we investigate how representations of IFS in the input-output structures of learning models affect the capacity of uncertainty estimates derived from them to account for phonetic variability in speech. Specifically, we examine whether IFS are best represented as outputs to neural networks (as in previous studies) or as inputs by building models that embody both choices and examining their capacity to account for uncertainty effects in the formant trajectories of word final [ɐ], which in German discriminates around sixty different IFS. Overall, we find that formants are enhanced as the uncertainty associated with IFS decreases. This result dovetails with a growing number of studies of morphological and inflectional families that have shown that enhancement is associated with lower uncertainty in context. Importantly, we also find that in models where IFS serve as inputs-as our theoretical analysis suggests they ought to-their uncertainty measures provide better fits to the empirical variance observed in [ɐ] formants than models where IFS serve as outputs. This supports our suggestion that IFS serve as cognitive cues during speech production, and should be treated as such in modeling. It is also consistent with the idea that when IFS serve as inputs to a learning network, the distinction is maintained between those parts of the network that represent message and those that represent signal. We conclude by describing how maintaining a "signal-message-uncertainty distinction" can allow us to reconcile a range of apparently contradictory findings about the relationship between articulation and uncertainty in context.}, } @article {pmid35529579, year = {2022}, author = {Haiduk, F and Fitch, WT}, title = {Understanding Design Features of Music and Language: The Choric/Dialogic Distinction.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {786899}, pmid = {35529579}, issn = {1664-1078}, support = {W 1262/FWF_/Austrian Science Fund FWF/Austria ; }, abstract = {Music and spoken language share certain characteristics: both consist of sequences of acoustic elements that are combinatorically combined, and these elements partition the same continuous acoustic dimensions (frequency, formant space and duration).
However, the resulting categories differ sharply: scale tones and note durations of small integer ratios appear in music, while speech uses phonemes, lexical tone, and non-isochronous durations. Why did music and language diverge into the two systems we have today, differing in these specific features? We propose a framework based on information theory and a reverse-engineering perspective, suggesting that design features of music and language are a response to their differential deployment along three different continuous dimensions. These include the familiar propositional-aesthetic ('goal') and repetitive-novel ('novelty') dimensions, and a dialogic-choric ('interactivity') dimension that is our focus here. Specifically, we hypothesize that music exhibits specializations enhancing coherent production by several individuals concurrently-the 'choric' context. In contrast, language is specialized for exchange in tightly coordinated turn-taking-'dialogic' contexts. We examine the evidence for our framework, both from humans and non-human animals, and conclude that many proposed design features of music and language follow naturally from their use in distinct dialogic and choric communicative contexts. Furthermore, the hybrid nature of intermediate systems like poetry, chant, or solo lament follows from their deployment in the less typical interactive context.}, } @article {pmid35520977, year = {2021}, author = {Hall, A and Kawai, K and Graber, K and Spencer, G and Roussin, C and Weinstock, P and Volk, MS}, title = {Acoustic analysis of surgeons' voices to assess change in the stress response during surgical in situ simulation.}, journal = {BMJ simulation & technology enhanced learning}, volume = {7}, number = {6}, pages = {471-477}, pmid = {35520977}, issn = {2056-6697}, abstract = {INTRODUCTION: Stress may serve as an adjunct (challenge) or hindrance (threat) to the learning process. Determining the effect of an individual's response to situational demands in either a real or simulated situation may enable optimisation of the learning environment. Studies of acoustic analysis suggest that mean fundamental frequency and formant frequencies of voice vary with an individual's response during stressful events. This hypothesis is reviewed within the otolaryngology (ORL) simulation environment to assess whether acoustic analysis could be used as a tool to determine participants' stress response and cognitive load in medical simulation. Such an assessment could lead to optimisation of the learning environment.

METHODOLOGY: ORL simulation scenarios were performed to teach the participants teamwork and refine clinical skills. Each was performed in an actual operating room (OR) environment (in situ) with a multidisciplinary team consisting of ORL surgeons, OR nurses and anaesthesiologists. Ten of the scenarios were led by an ORL attending and ten were led by an ORL fellow. The vocal communication of each of the 20 individual leaders was analysed using long-term pitch analysis in the Praat software (autocorrelation method) to obtain the mean fundamental frequency (F0) and the first four formant frequencies (F1, F2, F3 and F4). In reviewing individual scenarios, each leader's voice was analysed during a non-stressful environment (WHO sign-out procedure) and compared with their voice during a stressful portion of the scenario (responding to deteriorating oxygen saturations in the manikin).
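(A minimal sketch of this comparison, assuming hand-labelled time spans: mean F0 is computed per interval with parselmouth, Praat's Python bindings, and the baseline-to-stress F0 shift is compared between fellow-led and attending-led scenarios. File names and interval times are hypothetical.)

import parselmouth
from parselmouth.praat import call
from scipy.stats import ttest_ind

def mean_f0(wav_path, t_start, t_end):
    # mean F0 (autocorrelation method, as in Praat) over one labelled interval
    pitch = parselmouth.Sound(wav_path).extract_part(t_start, t_end).to_pitch()
    return call(pitch, "Get mean", 0, 0, "Hertz")

def f0_shift(wav_path, baseline_span, stress_span):
    return mean_f0(wav_path, *stress_span) - mean_f0(wav_path, *baseline_span)

# fellows / attendings: placeholder lists of (wav, baseline_span, stress_span)
fellow_shifts = [f0_shift(w, b, s) for w, b, s in fellows]
attending_shifts = [f0_shift(w, b, s) for w, b, s in attendings]
print(ttest_ind(fellow_shifts, attending_shifts))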

RESULTS: The mean unstressed F0 for the male voice was 161.4 Hz and for the female voice was 217.9 Hz. The mean fundamental frequency of speech in the ORL fellow (lead surgeon) group increased by 34.5 Hz between the scenario's baseline and stressful portions. This was significantly different to the mean change of -0.5 Hz noted in the attending group (p=0.01). No changes were seen in F1, F2, F3 or F4.

CONCLUSIONS: This study demonstrates a method of acoustic analysis of the voices of participants taking part in medical simulations. It suggests acoustic analysis of participants may offer a simple, non-invasive, non-intrusive adjunct in evaluating and titrating the stress response during simulation.}, } @article {pmid35497112, year = {2022}, author = {Jarollahi, F and Valadbeigi, A and Jalaei, B and Maarefvand, M and Motasaddi Zarandy, M and Haghani, H and Shirzhiyzn, Z}, title = {Comparing Sound-Field Speech-Auditory Brainstem Response Components between Cochlear Implant Users with Different Speech Recognition in Noise Scores.}, journal = {Iranian journal of child neurology}, volume = {16}, number = {2}, pages = {93-105}, pmid = {35497112}, issn = {1735-4668}, abstract = {OBJECTIVES: Many studies have suggested that cochlear implant (CI) users vary in terms of speech recognition in noise. Studies in this field attribute this variety partly to subcortical auditory processing. Studying speech-Auditory Brainstem Response (speech-ABR) provides good information about speech processing; thus, this work was designed to compare speech-ABR components between two groups of CI users with good and poor speech recognition in noise scores.

MATERIALS & METHODS: The present study was conducted on two groups of CI users aged 8-10 years old. The first group (CI-good) consisted of 15 children with prelingual CI who had good speech recognition in noise performance. The second group (CI-poor) was matched with the first group, but they had poor speech recognition in noise performance. The speech-ABR test in a sound-field presentation was performed for all the participants.

RESULTS: The speech-ABR response showed longer C, D, E, F, and O latencies in CI-poor than in CI-good users (P < 0.05), whereas no significant difference was observed in the initial waves V (t = -0.293, P = 0.771) and A (t = -1.051, P = 0.307). Analysis in the spectral domain showed a weaker representation of the fundamental frequency, as well as of the first formant and the high-frequency component of the speech stimuli, in the CI users with poor auditory performance.
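(A sketch of the spectral-domain analysis mentioned above: the averaged speech-ABR waveform is Fourier-transformed and mean amplitude is read out in bands around the fundamental frequency, the first formant, and the high-frequency component. Band edges here are illustrative, not the authors' values.)

import numpy as np

def band_amplitude(response, fs, f_lo, f_hi):
    # mean FFT magnitude of the averaged evoked response within [f_lo, f_hi] Hz
    spectrum = np.abs(np.fft.rfft(response)) / len(response)
    freqs = np.fft.rfftfreq(len(response), d=1.0 / fs)
    band = (freqs >= f_lo) & (freqs <= f_hi)
    return spectrum[band].mean()

# response: averaged waveform (numpy array); fs: sampling rate in Hz
f0_amp = band_amplitude(response, fs, 100.0, 120.0)   # fundamental region
f1_amp = band_amplitude(response, fs, 400.0, 720.0)   # first-formant region
hf_amp = band_amplitude(response, fs, 720.0, 1200.0)  # high-frequency region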

CONCLUSIONS: Results revealed that CI users who showed poor auditory performance in noise had deficits in encoding the periodic portion of speech signals at the brainstem level. This study could also serve as physiological evidence for poorer pitch processing in CI users with poor speech recognition in noise.}, } @article {pmid35452247, year = {2022}, author = {Houle, N and Goudelias, D and Lerario, MP and Levi, SV}, title = {Effect of Anchor Term on Auditory-Perceptual Ratings of Feminine and Masculine Speakers.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {6}, pages = {2064-2080}, pmid = {35452247}, issn = {1558-9102}, support = {T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Auditory Perception ; Cues ; Female ; Humans ; Male ; Speech ; *Speech Acoustics ; *Speech Perception ; }, abstract = {BACKGROUND: Studies investigating auditory perception of gender expression vary greatly in the specific terms applied to gender expression in rating scales.

PURPOSE: This study examined the effects of different anchor terms on listeners' auditory perceptions of gender expression in phonated and whispered speech. Additionally, token and speaker cues were examined to identify predictors of the auditory-perceptual ratings.

METHOD: Inexperienced listeners (n = 105) completed an online rating study in which they were asked to use one of five visual analog scales (VASs) to rate cis men, cis women, and transfeminine speakers in both phonated and whispered speech. The VASs varied by anchor term (very female/very male, feminine/masculine, feminine female/masculine male, very feminine/not at all feminine, and not at all masculine/very masculine).

RESULTS: Linear mixed-effects models revealed significant two-way interactions of gender expression by anchor term and gender expression by condition. In general, the feminine female/masculine male scale resulted in the most extreme ratings (closest to the end points), and the feminine/masculine scale resulted in the most central ratings. As expected, for all speakers, whispered speech was rated more centrally than phonated speech. Additionally, ratings of phonated speech were predicted by mean fundamental frequency (fo) within each speaker group and by smoothed cepstral peak prominence in cisgender speakers. In contrast, ratings of whispered speech, which lacks an fo, were predicted by indicators of vocal tract resonance (second formant and speaker height).
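(A minimal statsmodels sketch of the mixed-effects structure reported here: fixed effects with the two two-way interactions and a random intercept per listener. The long-format table ratings and its column names are hypothetical.)

import statsmodels.formula.api as smf

model = smf.mixedlm(
    "rating ~ gender_expression * anchor_term + gender_expression * condition",
    data=ratings,                # one row per rated token
    groups=ratings["listener"],  # random intercept for each listener
)
print(model.fit().summary())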

CONCLUSIONS: The current results indicate that differences in the terms applied to rating scales limit generalization of results across studies. Identifying the patterns across listener ratings of gender expression provides a rationale for researchers and clinicians when making choices about terms. Additionally, beyond fo and vocal tract resonance, predictors of listener ratings vary based on the anchor terms used to describe gender expression.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.19617564.}, } @article {pmid35418360, year = {2022}, author = {Kırbac, A and Turkyılmaz, MD and Yağcıoglu, S}, title = {Gender Effects on Binaural Speech Auditory Brainstem Response.}, journal = {The journal of international advanced otology}, volume = {18}, number = {2}, pages = {125-130}, pmid = {35418360}, issn = {2148-3817}, mesh = {Acoustic Stimulation ; Adult ; Brain Stem/physiology ; *Evoked Potentials, Auditory, Brain Stem/physiology ; Female ; Humans ; Male ; Speech/physiology ; *Speech Perception/physiology ; }, abstract = {BACKGROUND: The speech auditory brainstem response is a tool that provides direct information on how speech sound is temporally and spectrally coded by the auditory brainstem. Speech auditory brainstem response is influenced by many variables, but the effect of gender is unclear, particularly in the binaural recording. Studies on speech auditory brainstem response evoked by binaural stimulation are limited, but gender studies are even more limited and contradictory. This study aimed at examining the effect of gender on speech auditory brainstem response in adults.

METHODS: Time- and frequency-domain analyses of speech auditory brainstem response recordings of 30 healthy participants (15 women and 15 men) aged 18-35 years with normal hearing and no musical education were obtained. For each adult, speech auditory brainstem response was recorded with the syllable /da/ presented binaurally. Peaks of time (V, A, C, D, E, F, and O) and frequency (fundamental frequency, first formant frequency, and high frequency) domains of speech auditory brainstem response were compared between men and women.
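(The time-domain comparison described here reduces to a per-wave test of peak latencies between women and men; an independent-samples t test is used below purely for illustration, with latencies as a hypothetical mapping from group to per-wave latency arrays.)

from scipy.stats import ttest_ind

for wave in ["V", "A", "C", "D", "E", "F", "O"]:
    t, p = ttest_ind(latencies["women"][wave], latencies["men"][wave])
    print(f"wave {wave}: t = {t:.2f}, p = {p:.3f}")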

RESULTS: V, A, and F peak latencies of women were significantly shorter than those of men (P < .05). However, no difference was found in the peak amplitude of the time (P > .05) or frequency domain between women and men (P > .05).

CONCLUSION: Gender differences in binaural speech auditory brainstem response are significant in adults, particularly in the time domain. When speech stimuli are used for auditory brainstem responses, normative data specific to gender are required. Preliminary normative data from this study could serve as a reference for future studies on binaural speech auditory brainstem response among Turkish adults.}, } @article {pmid35416268, year = {2022}, author = {Yasar, OC and Ozturk, S and Kemal, O and Kocabicak, E}, title = {Effects of Subthalamic Nucleus Deep Brain Stimulation Surgery on Voice and Formant Frequencies of Vowels in Turkish.}, journal = {Turkish neurosurgery}, volume = {32}, number = {5}, pages = {764-772}, doi = {10.5137/1019-5149.JTN.36134-21.2}, pmid = {35416268}, issn = {2651-5032}, mesh = {*Deep Brain Stimulation/methods ; Humans ; Language ; *Parkinson Disease/surgery ; *Subthalamic Nucleus/physiology/surgery ; }, abstract = {AIM: To investigate the effects of deep brain stimulation (DBS) of the subthalamic nucleus (STN) on acoustic characteristics of voice production in Turkish patients with Parkinson's disease (PD).

MATERIAL AND METHODS: This study recruited 20 patients diagnosed with PD. Voice samples were recorded under the "stimulation on" and "stimulation off" conditions of STN-DBS. Acoustic recordings of the patients were made during the production of vowels /a/, /o/, and /i/ and repetition of the syllables /pa/-/ta/-/ka/. Acoustic analyses were performed using Praat.

RESULTS: A significant difference in the parameters was observed among groups for vowels. A significant positive difference was observed between the preoperative med-on and postoperative med-on/stim-on groups for /a/ and between the postoperative med-on/stim-on and postoperative med-on/stim-off groups for /o/ and /i/ for frequency perturbation (jitter) and noise-to-harmonics ratio. No significant difference was noted between the preoperative med-on and postoperative med-on/stim-off groups for any vowels.

CONCLUSION: STN-DBS surgery has an acute positive effect on voice. Studies on formant frequency analysis in STN-DBS may be expanded with both articulation and intelligibility tests to enable us to combine patient abilities in various perspectives and to obtain precise results.}, } @article {pmid35400757, year = {2022}, author = {Whalen, DH and DiCanio, C and Dockum, R}, title = {Phonetic Documentation in Three Collections: Topics and Evolution.}, journal = {Journal of the International Phonetic Association}, volume = {52}, number = {1}, pages = {95-121}, pmid = {35400757}, issn = {0025-1003}, support = {R01 DC002717/DC/NIDCD NIH HHS/United States ; }, abstract = {Phonetic aspects of many languages have been documented, though the breadth and focus of such documentation varies substantially. In this survey, phonetic aspects (here called "categories") that are typically reported were assessed in three English-language collections-the Illustrations of the IPA, articles from the Journal of Phonetics, and papers from the Ladefoged/Maddieson Sounds of the World's Languages (SOWL) documentation project. Categories were defined for consonants (e.g., Voice Onset Time (VOT) and frication spectrum; 10 in total), vowels (e.g., formants and duration; 7 total) and suprasegmentals (e.g., stress and distinctive vowel length, 6 total). The Illustrations, due to their brevity, had, on average, limited coverage of the selected categories (12% of the 23 categories). Journal of Phonetics articles were typically theoretically motivated, but 64 had sufficient measurements to count as phonetic documentation; these also covered 12% of the categories. The SOWL studies, designed to cover as much of the phonetic structure as feasible in an article-length treatment, achieved 41% coverage on average. Four book-length studies were also examined, with an average of 49% coverage. Phonetic properties of many language families have been studied, though Indo-European is still disproportionately represented. Physiological measures were excluded as being less common, and perceptual measures were excluded as being typically more theoretical. This preliminary study indicates that certain acoustic properties of languages are typically measured and may be considered as an impetus for later, fuller coverage, but broader consensus on the categories is needed. Current documentation efforts could be more useful if these considerations were addressed.}, } @article {pmid35394801, year = {2022}, author = {Dahl, KL and François, FA and Buckley, DP and Stepp, CE}, title = {Voice and Speech Changes in Transmasculine Individuals Following Circumlaryngeal Massage and Laryngeal Reposturing.}, journal = {American journal of speech-language pathology}, volume = {31}, number = {3}, pages = {1368-1382}, pmid = {35394801}, issn = {1558-9110}, support = {R01 DC015570/DC/NIDCD NIH HHS/United States ; R01 DC020061/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Male ; Massage ; Speech ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {PURPOSE: The purpose of this study was to measure the short-term effects of circumlaryngeal massage and laryngeal reposturing on acoustic and perceptual characteristics of voice in transmasculine individuals.

METHOD: Fifteen transmasculine individuals underwent one session of sequential circumlaryngeal massage and laryngeal reposturing with a speech-language pathologist. Voice recordings were collected at three time points-baseline, postmassage, and postreposturing. Fundamental frequency (fo), formant frequencies, and relative fundamental frequency (RFF; an acoustic correlate of laryngeal tension) were measured. Estimates of vocal tract length (VTL) were derived from formant frequencies. Twelve listeners rated the perceived masculinity of participants' voices at each time point. Repeated-measures analyses of variance measured the effect of time point on fo, estimated VTL, RFF, and perceived voice masculinity. Significant effects were evaluated with post hoc Tukey's tests.
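(The abstract does not give the authors' exact VTL estimator; one common choice, sketched below, treats the vocal tract as a uniform closed-open tube, for which the i-th formant of a tract of length L is Fi = (2i - 1)c / 4L, and averages the per-formant length estimates.)

SPEED_OF_SOUND_CM_S = 35000.0

def estimate_vtl_cm(formants_hz):
    # quarter-wave resonator: L = (2i - 1) * c / (4 * Fi), averaged over formants
    lengths = [(2 * i - 1) * SPEED_OF_SOUND_CM_S / (4.0 * f)
               for i, f in enumerate(formants_hz, start=1)]
    return sum(lengths) / len(lengths)

# F1-F4 of an idealized neutral vowel (illustrative values only)
print(round(estimate_vtl_cm([500.0, 1500.0, 2500.0, 3500.0]), 1))  # 17.5 cm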

RESULTS: Between baseline and the end of the session, fo decreased, VTL increased, and participants' voices were perceived as more masculine, all with statistically significant differences. RFF did not differ significantly at any time point. Outcomes were highly variable at the individual level.

CONCLUSION: Circumlaryngeal massage and laryngeal reposturing have short-term effects on select acoustic (fo, estimated VTL) and perceptual characteristics (listener-assigned voice masculinity) of voice in transmasculine individuals.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.19529299.}, } @article {pmid35377739, year = {2022}, author = {Swann, Z and Daliri, A and Honeycutt, CF}, title = {Impact of Startling Acoustic Stimuli on Word Repetition in Individuals With Aphasia and Apraxia of Speech Following Stroke.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {5}, pages = {1671-1685}, doi = {10.1044/2022_JSLHR-21-00486}, pmid = {35377739}, issn = {1558-9102}, mesh = {Acoustics ; *Aphasia/etiology ; *Apraxias/etiology ; Humans ; Reflex, Startle/physiology ; Speech Intelligibility ; *Stroke/complications ; }, abstract = {PURPOSE: The StartReact effect, whereby movements are elicited by loud, startling acoustic stimuli (SAS), allows the evaluation of movements when initiated through involuntary circuitry, before auditory feedback. When StartReact is applied during poststroke upper extremity movements, individuals exhibit increased muscle recruitment, reaction times, and reaching distances. StartReact releases unimpaired speech with similar increases in muscle recruitment and reaction time. However, as poststroke communication disorders have divergent neural circuitry from upper extremity tasks, it is unclear if StartReact will enhance speech poststroke. Our objective is to determine if (a) StartReact is present in individuals with poststroke aphasia and apraxia and (b) SAS exposure enhances speech intelligibility.

METHOD: We remotely delivered startling, 105-dB white noise bursts (SAS) and quiet, non-SAS cues to 15 individuals with poststroke aphasia and apraxia during repetition of six words. We evaluated average word intensity, pitch, pitch trajectories, vowel formants F1 and F2 (first and second formants), phonemic error rate, and percent incidence of each SAS versus non-SAS-elicited phoneme produced under each cue type.

RESULTS: For SAS trials compared to non-SAS, speech intensity increased (∆ + 0.6 dB), speech pitch increased (∆ + 22.7 Hz), and formants (F1 and F2) changed, resulting in a smaller vowel space after SAS. SAS affected pitch trajectories for some, but not all, words. Non-SAS trials had more stops (∆ + 4.7 utterances) while SAS trials had more sustained phonemes (fricatives, glides, affricates, liquids; ∆ + 5.4 utterances). SAS trials had fewer distortion errors but no change in substitution errors or overall error rate compared to non-SAS trials.
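(One way to quantify the "smaller vowel space" finding: vowel space area as the convex hull of (F1, F2) points over a speaker's vowel tokens, computed separately per cue condition. The formant arrays are hypothetical, and this is a generic metric, not necessarily the authors' computation.)

import numpy as np
from scipy.spatial import ConvexHull

def vowel_space_area(f1_hz, f2_hz):
    points = np.column_stack([f1_hz, f2_hz])
    return ConvexHull(points).volume  # in 2-D, .volume is the hull's area

area_sas = vowel_space_area(f1_sas, f2_sas)  # SAS-elicited tokens
area_non = vowel_space_area(f1_non, f2_non)  # non-SAS tokens
print(f"vowel space ratio (SAS / non-SAS): {area_sas / area_non:.2f}")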

CONCLUSIONS: We show that stroke-impaired speech is susceptible to StartReact, evidenced by decreased intelligibility due to altered formants, pitch trajectories, and articulation, including increased incidence of sounds that could not be produced without SAS. Future studies should examine the impact of SAS on voluntary speech intelligibility and clinical measures of aphasia and apraxia.}, } @article {pmid35377182, year = {2022}, author = {Zhang, G and Shao, J and Zhang, C and Wang, L}, title = {The Perception of Lexical Tone and Intonation in Whispered Speech by Mandarin-Speaking Congenital Amusics.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {4}, pages = {1331-1348}, doi = {10.1044/2021_JSLHR-21-00345}, pmid = {35377182}, issn = {1558-9102}, mesh = {*Auditory Perceptual Disorders ; Humans ; Pitch Perception ; Recognition, Psychology ; Speech ; *Speech Perception ; }, abstract = {PURPOSE: A fundamental feature of human speech is variation, including the manner of phonation, as exemplified in the case of whispered speech. In this study, we employed whispered speech to examine an unresolved issue about congenital amusia, a neurodevelopmental disorder of musical pitch processing, which also affects speech pitch processing such as lexical tone and intonation perception. The controversy concerns whether amusia is a pitch-processing disorder or can affect speech processing beyond pitch.

METHOD: We examined lexical tone and intonation recognition in 19 Mandarin-speaking amusics and 19 matched controls in phonated and whispered speech, where fundamental frequency (fo) information is either present or absent.

RESULTS: The results revealed that the performance of congenital amusics was inferior to that of controls in lexical tone identification in both phonated and whispered speech. These impairments were also detected in identifying intonation (statements/questions) in phonated and whispered modes. Across the experiments, regression models revealed that fo and non-fo (duration, intensity, and formant frequency) acoustic cues predicted tone and intonation recognition in phonated speech, whereas non-fo cues predicted tone and intonation recognition in whispered speech. There were significant differences between amusics and controls in the use of both fo and non-fo cues.

CONCLUSION: The results provided the first evidence that the impairments of amusics in lexical tone and intonation identification persist in whispered speech and support the hypothesis that the deficits of amusia extend beyond pitch processing.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.19302275.}, } @article {pmid35363414, year = {2022}, author = {Carl, M and Levy, ES and Icht, M}, title = {Speech treatment for Hebrew-speaking adolescents and young adults with developmental dysarthria: A comparison of mSIT and Beatalk.}, journal = {International journal of language & communication disorders}, volume = {57}, number = {3}, pages = {660-679}, doi = {10.1111/1460-6984.12715}, pmid = {35363414}, issn = {1460-6984}, mesh = {Acoustics ; Adolescent ; *Dysarthria/etiology/therapy ; Humans ; Language ; Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; Young Adult ; }, abstract = {BACKGROUND: Individuals with developmental dysarthria typically demonstrate reduced functioning of one or more of the speech subsystems, which negatively impacts speech intelligibility and communication within social contexts. A few treatment approaches are available for improving speech production and intelligibility among individuals with developmental dysarthria. However, these approaches have only limited application and research findings among adolescents and young adults.

AIMS: To determine and compare the effectiveness of two treatment approaches, the modified Speech Intelligibility Treatment (mSIT) and the Beatalk technique, on speech production and intelligibility among Hebrew-speaking adolescents and young adults with developmental dysarthria.

METHODS & PROCEDURES: Two matched groups of adolescents and young adults with developmental dysarthria participated in the study. Each received one of the two treatments, mSIT or Beatalk, over the course of 9 weeks. Measures of speech intelligibility, articulatory accuracy, voice and vowel acoustics were assessed both pre- and post-treatment.

OUTCOMES & RESULTS: Both the mSIT and Beatalk groups demonstrated gains in at least some of the outcome measures. Participants in the mSIT group exhibited improvement in speech intelligibility and voice measures, while participants in the Beatalk group demonstrated increased articulatory accuracy and gains in voice measures from pre- to post-treatment. Significant increases were noted post-treatment for first formant values for select vowels.

CONCLUSIONS & IMPLICATIONS: Results of this preliminary study are promising for both treatment approaches. The differentiated results indicate their distinct application to speech intelligibility deficits. The current findings also hold clinical significance for treatment among adolescents and young adults with motor speech disorders and application for a language other than English.

WHAT THIS PAPER ADDS: What is already known on the subject: Developmental dysarthria (e.g., secondary to cerebral palsy) is a motor speech disorder that negatively impacts speech intelligibility, and thus communication participation. Select treatment approaches are available with the aim of improving speech intelligibility in individuals with developmental dysarthria; however, these approaches are limited in number and have only seldom been applied specifically to adolescents and young adults. What this paper adds to existing knowledge: The current study presents preliminary data regarding two treatment approaches, the mSIT and Beatalk technique, administered to Hebrew-speaking adolescents and young adults with developmental dysarthria in a group setting. Results demonstrate the initial effectiveness of the treatment approaches, with different gains noted for each approach across speech and voice domains. What are the potential or actual clinical implications of this work? The findings add to the existing literature on potential treatment approaches aiming to improve speech production and intelligibility among individuals with developmental dysarthria. The presented approaches also show promise for group-based treatments as well as the potential for improvement among adolescents and young adults with motor speech disorders.}, } @article {pmid35344948, year = {2022}, author = {Ho, GY and Kansy, IK and Klavacs, KA and Leonhard, M and Schneider-Stickler, B}, title = {Effect of FFP2/3 Masks on Voice Range Profile Measurement and Voice Acoustics in Routine Voice Diagnostics.}, journal = {Folia phoniatrica et logopaedica : official organ of the International Association of Logopedics and Phoniatrics (IALP)}, volume = {74}, number = {5}, pages = {335-344}, doi = {10.1159/000524299}, pmid = {35344948}, issn = {1421-9972}, mesh = {*Acoustics ; Adult ; COVID-19 ; COVID-19 Testing ; Female ; Humans ; Male ; *Masks ; Middle Aged ; Pandemics ; Phonation ; Speech Acoustics ; *Voice ; Young Adult ; }, abstract = {INTRODUCTION: Voice diagnostics including voice range profile (VRP) measurement and acoustic voice analysis is essential in laryngology and phoniatrics. Due to the COVID-19 pandemic, wearing of filtering face piece class 2 or 3 (FFP2/3) masks is recommended when high-risk aerosol-generating procedures like singing and speaking are being performed. The goal of this study was to compare VRP parameters measured without and with FFP2/3 masks. Further, formant analysis for sustained vowels, analysis of the singer's formant, and analysis of readings of a standard text were performed without/with FFP2/3 masks.

METHODS: Twenty subjects (6 males and 14 females) were enrolled in this study, with an average age of 36 ± 16 years (mean ± SD). Fourteen subjects were rated as euphonic/not hoarse and 6 as mildly hoarse. All subjects underwent VRP measurements and vowel and text recordings without/with FFP2/3 mask using the software DiVAS by XION medical (Berlin, Germany). Voice range of the singing voice, equivalent of voice extension measure (eVEM), fundamental frequency (F0), and sound pressure level (SPL) of soft speaking and shouting were calculated and analyzed. Maximum phonation time (MPT) and jitter-% were included for Dysphonia Severity Index (DSI) measurement. Analyses of the singer's formant were performed. Spectral analyses of sustained vowels /a:/, /i:/, and /u:/ (first = F1 and second = F2 formants), intensity of the long-term average spectrum, and alpha-ratio were calculated using the freeware Praat.
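(The DSI mentioned here is conventionally computed from the four measures named above; the standard weighted formula of Wuyts et al. (2000) is transcribed below, with illustrative input values.)

def dysphonia_severity_index(mpt_s, f0_high_hz, spl_low_db, jitter_percent):
    # DSI = 0.13*MPT + 0.0053*F0high - 0.26*SPLlow - 1.18*jitter(%) + 12.4
    return (0.13 * mpt_s + 0.0053 * f0_high_hz
            - 0.26 * spl_low_db - 1.18 * jitter_percent + 12.4)

# e.g. MPT 25 s, highest F0 880 Hz, softest SPL 55 dB, jitter 0.5 %
print(round(dysphonia_severity_index(25.0, 880.0, 55.0, 0.5), 1))  # 5.4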

RESULTS: For all subjects, the mean values of routine voice parameters without/with mask were analyzed: no significant differences were found in singing voice range, eVEM, SPL, or frequency of soft speaking/shouting, except for a significantly lower mean SPL of shouting with the FFP2/3 mask, in particular among the female subjects (p = 0.002). Results for MPT, jitter, and DSI without/with FFP2/3 mask showed no significant differences. Further mean values analyzed without/with mask were the ratio of the singer's formant to loud singing, which was lower with the FFP2/3 mask (p = 0.001), and F1 and F2 of /a:/, /i:/, and /u:/, which showed no significant differences except for a lower F2 of /i:/ with the FFP2/3 mask (p = 0.005). With the exceptions mentioned, the t test revealed no significant differences for any of the routine parameters tested in the recordings without and with a FFP2/3 mask.

CONCLUSION: It can be concluded that VRP measurements including DSI performed with FFP2/3 masks provide reliable data in clinical routine with respect to voice condition/constitution. Spectral analyses of sustained vowel, text, and singer's formant will be affected by wearing FFP2/3 masks.}, } @article {pmid35344807, year = {2022}, author = {Chauvette, L and Fournier, P and Sharp, A}, title = {The frequency-following response to assess the neural representation of spectral speech cues in older adults.}, journal = {Hearing research}, volume = {418}, number = {}, pages = {108486}, doi = {10.1016/j.heares.2022.108486}, pmid = {35344807}, issn = {1878-5891}, mesh = {Acoustic Stimulation/methods ; Aged ; Cues ; *Hearing Loss ; Humans ; Speech ; *Speech Perception/physiology ; }, abstract = {Older adults often present difficulties understanding speech that cannot be explained by age-related changes in sound audibility. Psychoacoustic and electrophysiologic studies have linked these suprathreshold difficulties to age-related deficits in the auditory processing of temporal and spectral sound information. These studies suggest the existence of an age-related temporal processing deficit in the central auditory system, but the existence of such deficit in the spectral domain remains understudied. The FFR is an electrophysiological evoked response that assesses the ability of the neural auditory system to reproduce the spectral and temporal patterns of a sound. The main goal of this short review is to investigate if the FFR can identify and measure spectral processing deficits in the elderly compared to younger adults (for both, without hearing loss or competing noise). Furthermore, we want to determine what stimuli and analyses have been used in the literature to assess the neural encoding of spectral cues in older adults. Almost all reviewed articles showed an age-related decline in the auditory processing of spectral acoustic information. Even when using different speech and non-speech stimuli, studies reported an age-related decline at the fundamental frequency, at the first formant, and at other harmonic components using different metrics, such as the response's amplitude, inter-trial phase coherence, signal-to-response correlation, and signal-to-noise ratio. These results suggest that older adults may present age-related spectral processing difficulties, but further FFR studies are needed to clarify the effect of advancing age on the neural encoding of spectral speech cues. Spectral processing research on aging would benefit from using a broader variety of stimuli and from rigorously controlling for hearing thresholds even in the absence of disabling hearing loss. Advances in the understanding of the effect of age on FFR measures of spectral encoding could lead to the development of new clinical tools, with possible applications in the field of hearing aid fitting.}, } @article {pmid35310278, year = {2022}, author = {Zaltz, Y and Kishon-Rabin, L}, title = {Difficulties Experienced by Older Listeners in Utilizing Voice Cues for Speaker Discrimination.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {797422}, pmid = {35310278}, issn = {1664-1078}, abstract = {Human listeners are assumed to apply different strategies to improve speech recognition in background noise. 
Young listeners with normal hearing (NH), for example, have been shown to follow the voice of a particular speaker based on the fundamental (F0) and formant frequencies, which are both influenced by the gender, age, and size of the speaker. However, the auditory and cognitive processes that underlie the extraction and discrimination of these voice cues across speakers may be subject to age-related decline. The present study aimed to examine the utilization of F0 and formant cues for voice discrimination (VD) in older adults with hearing expected for their age. Difference limens (DLs) for VD were estimated in 15 healthy older adults (65-78 years old) and 35 young adults (18-35 years old) using only F0 cues, only formant frequency cues, and a combination of F0 + formant frequencies. A three-alternative forced-choice paradigm with an adaptive-tracking threshold-seeking procedure was used. The Wechsler backward digit span test was used as a measure of auditory working memory. The Trail Making Test (TMT) was used to provide cognitive information reflecting a combined effect of processing speed, mental flexibility, and executive control abilities. The results showed that (a) the mean VD thresholds of the older adults were poorer than those of the young adults for all voice cues, although larger variability was observed among the older listeners; (b) both age groups found the formant cues more beneficial for VD, compared to the F0 cues, and the combined (F0 + formant) cues resulted in better thresholds, compared to each cue separately; (c) significant associations were found for the older adults in the combined F0 + formant condition between VD and TMT scores, and between VD and hearing sensitivity, supporting the notion that a decline with age in both top-down and bottom-up mechanisms may hamper the ability of older adults to discriminate between voices. The present findings suggest that older listeners may have difficulty following the voice of a specific speaker and thus in using this as a strategy for listening amid noise. This may contribute to understanding their reported difficulty listening in adverse conditions.}, } @article {pmid35288014, year = {2024}, author = {Paulino, CEB and Silva, HJD and Gomes, AOC and Silva, JMSD and Cunha, DAD and Coriolano, MDGWS and Lopes, LW and Lira, ZS}, title = {Relationship Between Oropharyngeal Geometry and Vocal Parameters in Subjects With Parkinson's Disease.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {4}, pages = {967.e9-967.e17}, doi = {10.1016/j.jvoice.2022.01.020}, pmid = {35288014}, issn = {1873-4588}, mesh = {Humans ; *Parkinson Disease/physiopathology/diagnosis ; Male ; Female ; Middle Aged ; Cross-Sectional Studies ; *Speech Acoustics ; *Voice Quality ; Aged ; Retrospective Studies ; *Oropharynx/physiopathology ; *Speech Production Measurement ; *Acoustics ; Phonation ; Voice Disorders/physiopathology/diagnosis/etiology ; }, abstract = {OBJECTIVE: To verify whether the dimensions of different segments of the oropharyngeal cavity have different proportions between Parkinson's disease patients and vocally healthy subjects and investigate whether the measurements of these subjects' oropharyngeal geometry associate with their acoustic measurements of voice.

METHOD: Quantitative, descriptive, cross-sectional, and retrospective study with secondary data, approved by the Human Research Ethics Committee under no. 4.325.029. We used vocal samples and data from the oropharyngeal geometry of 40 subjects - 20 with Parkinson's disease stages I to III and 20 who formed the control group, matched for sex and age. Each group had 10 males and 10 females, mean age of 61 years (±6.0). Formant (F1, F2, and F3) and cepstral measures of the sustained vowel /ε/ were extracted with Praat software and entered into the database. The data were descriptively analyzed, with statistics generated with R software. The proportion of oropharyngeal geometry measurements was arranged by mean values and coefficients of variation. Pearson's linear correlation test was applied to relate voice parameters to oropharyngeal geometry, considering P < 0.05, and a linear regression test was applied to explain F2.
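(A sketch of the named statistics, assuming a per-subject data frame df with hypothetical column names: Pearson correlations relate geometry measures to formants, and an ordinary least squares model stands in for the regression used to explain F2.)

from scipy.stats import pearsonr
import statsmodels.formula.api as smf

for measure in ["oral_cavity_length", "glottal_area", "pharynx_length"]:
    r, p = pearsonr(df[measure], df["F2"])
    print(f"{measure} vs F2: r = {r:.2f}, p = {p:.3f}")

fit = smf.ols("F2 ~ glottal_area + age", data=df).fit()
print(fit.rsquared)  # compare with the reported R^2 of 0.61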

RESULTS: The Parkinson's disease group showed a linear relationship between oral cavity length and F1 in males (P = 0.04) and between glottal area and F2 in females (P = 0.00); linear relationships were established according to age in both groups, and a regression model for F2 was estimated (R² = 0.61). There was no difference between pathological and healthy voices; there was a difference in the proportional relationship of oropharyngeal geometry between the groups.

CONCLUSION: The proportional relationship of oropharyngeal geometry differs between the Parkinson's disease group and the control group, as well as the relationship between oropharyngeal geometry and formant and cepstral values of voice according to the subjects' sex and age.}, } @article {pmid35276418, year = {2022}, author = {Jüchter, C and Beutelmann, R and Klump, GM}, title = {Speech sound discrimination by Mongolian gerbils.}, journal = {Hearing research}, volume = {418}, number = {}, pages = {108472}, doi = {10.1016/j.heares.2022.108472}, pmid = {35276418}, issn = {1878-5891}, mesh = {Animals ; Auditory Perception/physiology ; Gerbillinae ; Humans ; *Phonetics ; Speech/physiology ; *Speech Perception/physiology ; }, abstract = {The present study establishes the Mongolian gerbil (Meriones unguiculatus) as a model for investigating the perception of human speech sounds. We report data on the discrimination of logatomes (CVCs - consonant-vowel-consonant combinations with outer consonants /b/, /d/, /s/ and /t/ and central vowels /a/, /aː/, /ɛ/, /eː/, /ɪ/, /iː/, /ɔ/, /oː/, /ʊ/ and /uː/, VCVs - vowel-consonant-vowel combinations with outer vowels /a/, /ɪ/ and /ʊ/ and central consonants /b/, /d/, /f/, /g/, /k/, /l/, /m/, /n/, /p/, /s/, /t/ and /v/) by gerbils. Four gerbils were trained to perform an oddball target detection paradigm in which they were required to discriminate a deviant CVC or VCV in a sequence of CVC or VCV standards, respectively. The experiments were performed with an ICRA-1 noise masker with speech-like spectral properties, and logatomes of multiple speakers were presented at various signal-to-noise ratios. Response latencies were measured to generate perceptual maps employing multidimensional scaling, which visualize the gerbils' internal maps of the sounds. The dimensions of the perceptual maps were correlated to multiple phonetic features of the speech sounds for evaluating which features of vowels and consonants are most important for the discrimination. The perceptual representation of vowels and consonants in gerbils was similar to that of humans, although gerbils needed higher signal-to-noise ratios for the discrimination of speech sounds than humans. The gerbils' discrimination of vowels depended on differences in the frequencies of the first and second formant determined by tongue height and position. Consonants were discriminated based on differences in combinations of their articulatory features. The similarities in the perception of logatomes by gerbils and humans renders the gerbil a suitable model for human speech sound discrimination.}, } @article {pmid35259200, year = {2022}, author = {Tamura, T and Tanaka, Y and Watanabe, Y and Sato, K}, title = {Relationships between maximum tongue pressure and second formant transition in speakers with different types of dysarthria.}, journal = {PloS one}, volume = {17}, number = {3}, pages = {e0264995}, pmid = {35259200}, issn = {1932-6203}, mesh = {Adult ; Aged ; *Dysarthria ; Female ; Humans ; Male ; Pressure ; *Speech Acoustics ; Speech Intelligibility/physiology ; Speech Production Measurement ; Tongue ; Young Adult ; }, abstract = {The effects of muscle weakness on speech are currently not fully known. We investigated the relationships between maximum tongue pressure and second formant transition in adults with different types of dysarthria. It focused on the slope in the second formant transition because it reflects the tongue velocity during articulation. 
Sixty-three Japanese speakers with dysarthria (median age, 68 years; interquartile range, 58-77 years; 44 men and 19 women) admitted to acute and convalescent hospitals were included. Thirty neurologically normal speakers aged 19-85 years (median age, 22 years; interquartile range, 21.0-23.8 years; 14 men and 16 women) were also included. The relationship between the maximum tongue pressure and speech function was evaluated using correlation analysis in the dysarthria group. Speech intelligibility, the oral diadochokinesis rate, and the second formant slope were used as indices of impaired speech. More than half of the speakers had mild to moderate dysarthria. Speakers with dysarthria showed significantly lower maximum tongue pressure, speech intelligibility, oral diadochokinesis rate, and second formant slope than neurologically normal speakers. Only the second formant slope was significantly correlated with the maximum tongue pressure (r = 0.368, p = 0.003). The relationship between the second formant slope and maximum tongue pressure showed a similar correlation in the analysis of subgroups divided by sex. The oral diadochokinesis rate, which is related to the speed of articulation, is affected by voice on/off, mandibular opening/closing, and range of motion. In contrast, the second formant slope was less affected by these factors. These results suggest that the maximum isometric tongue strength is associated with tongue movement speed during articulation.}, } @article {pmid35250034, year = {2022}, author = {Georgiou, GP}, title = {Acoustic markers of vowels produced with different types of face masks.}, journal = {Applied acoustics. Acoustique applique. Angewandte Akustik}, volume = {191}, number = {}, pages = {108691}, pmid = {35250034}, issn = {0003-682X}, abstract = {The wide spread of SARS-CoV-2 led to the extensive use of face masks in public places. Although masks offer significant protection from infectious droplets, they also impact verbal communication by altering the speech signal. The present study examines how two types of face masks affect the speech properties of vowels. Twenty speakers were recorded producing their native vowels in a /pVs/ context, maintaining a normal speaking rate. Speakers were asked to produce the vowels in three conditions: (a) with a surgical mask, (b) with a cotton mask, and (c) without a mask. The speakers' output was analyzed with the Praat speech acoustics software. We fitted three linear mixed-effects models to investigate the mask-wearing effects on the first formant (F1), second formant (F2), and duration of vowels. The results demonstrated that F1 and duration of vowels remained intact in the masked conditions compared to the unmasked condition, while F2 was altered for three out of five vowels (/e a u/) with the surgical mask and two out of five vowels (/e a/) with the cotton mask. So, both types of masks altered the speech signal to some extent, and they mostly affected the same vowel qualities. It is concluded that some acoustic properties are more sensitive than others to speech signal modification when speech is filtered through masks, while different sounds are affected in different ways.
The findings may have significant implications for second/foreign language instructors who teach pronunciation and for speech therapists who teach sounds to individuals with language disorders.}, } @article {pmid35249395, year = {2023}, author = {Bertucci, V and Stevens, K and Sidhu, N and Suri, S and Bressmann, T}, title = {The Impact of Fan-Type Rapid Palatal Expanders on Speech in Patients With Unilateral Cleft Lip and Palate.}, journal = {The Cleft palate-craniofacial journal : official publication of the American Cleft Palate-Craniofacial Association}, volume = {60}, number = {7}, pages = {875-887}, pmid = {35249395}, issn = {1545-1569}, mesh = {Humans ; *Cleft Lip/surgery ; Speech ; *Cleft Palate/surgery ; Prospective Studies ; }, abstract = {Rapid palatal expanders (RPEs) are commonly used in patients with cleft lip and palate (CLP) prior to secondary alveolar bone grafting (SABG). Their position and size can impede tongue movement and affect speech. This study assessed changes in perception and production of speech over the course of RPE treatment. DESIGN: Prospective longitudinal. SETTING: Tertiary university-affiliated hospital. PARTICIPANTS: Twenty-five patients with unilateral CLP treated with Fan-type RPEs, and their parents. Patient and parent speech questionnaires and patient speech recordings were collected at baseline before RPE insertion (T1), directly after RPE insertion (T2), during RPE expansion (T3), during RPE retention (T4), directly after RPE removal but before SABG (T5), and at short-term follow-up after RPE removal and SABG (T6). Ratings for patient and parent questionnaires, first (F1) and second (F2) formants for vowels /a/, /i/, and /u/, and nasalance scores for non-nasal and nasal sentences were obtained and analyzed using mixed model analyses of variance. RESULTS: Ratings worsened at T2. For the vowel /a/, F1 and F2 were unchanged at T2. For the vowel /i/, F1 increased and F2 decreased at T2. For the vowel /u/, F1 was unchanged and F2 decreased at T2. Nasalance was unchanged at T2. All outcome measures returned to T1 levels by T4. CONCLUSIONS: RPE insertion resulted in initial adverse effects on speech perception and production, which decreased to baseline prior to removal. Information regarding transient speech dysfunction and distress may help prepare patients for treatment.
Vocal tract shortening from the original length conveyed smaller size and less aggression, whereas vocal tract elongation conveyed larger size and more aggression, and these effects were stronger for static than for dynamic scaling. Listeners familiarized with the speaker's natural voice were less often 'fooled' by formant manipulations when judging speaker size, but paid more attention to formants when judging aggressive intent. Thus, within-call vocal tract scaling conveys emotion, but a better way to sound large and intimidating is to keep the vocal tract consistently extended.}, } @article {pmid35240298, year = {2022}, author = {Haider, CL and Suess, N and Hauswald, A and Park, H and Weisz, N}, title = {Masking of the mouth area impairs reconstruction of acoustic speech features and higher-level segmentational features in the presence of a distractor speaker.}, journal = {NeuroImage}, volume = {252}, number = {}, pages = {119044}, doi = {10.1016/j.neuroimage.2022.119044}, pmid = {35240298}, issn = {1095-9572}, support = {P 31230/FWF_/Austrian Science Fund FWF/Austria ; P 34237/FWF_/Austrian Science Fund FWF/Austria ; }, mesh = {Acoustic Stimulation ; Acoustics ; Humans ; Mouth ; *Speech ; *Speech Perception ; Visual Perception ; }, abstract = {Multisensory integration enables stimulus representation even when the sensory input in a single modality is weak. In the context of speech, when confronted with a degraded acoustic signal, congruent visual inputs promote comprehension. When this input is masked, speech comprehension consequently becomes more difficult. But it still remains inconclusive which levels of speech processing are affected under which circumstances by occluding the mouth area. To answer this question, we conducted an audiovisual (AV) multi-speaker experiment using naturalistic speech. In half of the trials, the target speaker wore a (surgical) face mask, while we measured the brain activity of normal hearing participants via magnetoencephalography (MEG). We additionally added a distractor speaker in half of the trials in order to create an ecologically difficult listening situation. A decoding model on the clear AV speech was trained and used to reconstruct crucial speech features in each condition. We found significant main effects of face masks on the reconstruction of acoustic features, such as the speech envelope and spectral speech features (i.e. pitch and formant frequencies), while reconstruction of higher level features of speech segmentation (phoneme and word onsets) were especially impaired through masks in difficult listening situations. As we used surgical face masks in our study, which only show mild effects on speech acoustics, we interpret our findings as the result of the missing visual input. 
Our findings extend previous behavioural results by demonstrating the complex contextual effects of occluding relevant visual information on speech processing.}, } @article {pmid35232632, year = {2024}, author = {Hoyer, P and Riedler, M and Unterhofer, C and Graf, S}, title = {Vocal Tract and Subglottal Impedance in High Performance Singing: A Case Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {5}, pages = {1248.e11-1248.e21}, doi = {10.1016/j.jvoice.2022.01.015}, pmid = {35232632}, issn = {1873-4588}, mesh = {Humans ; *Singing ; *Phonation ; Female ; *Glottis/physiology ; *Voice Quality ; Prospective Studies ; Vibration ; Acoustics ; Electric Impedance ; Biomechanical Phenomena ; Inhalation/physiology ; Vocal Cords/physiology ; Adult ; Sound Spectrography ; Exhalation/physiology ; }, abstract = {OBJECTIVES/HYPOTHESIS: The respiratory process is important in vocal training, and in professional singing the airflow is highly important. It is hypothesized that subglottal resonances are important to the singing voice in high performance singing.

STUDY DESIGN: Single subject, prospective.

METHOD: A professional soprano singer shaped her vocal tract to form the vowels [a], [e], [i], [o], and [u] at the pitch d4. We measured phonated vowels and the vocal tract impedance spectra with a deterministic noise supplied by an iPhone buzzer in the range of 200 to 4,000 Hz at closed glottis, during exhalation and during inhalation while maintaining the shape of the vocal tract.
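(A generic sketch of reading resonances off such measurements, not the authors' exact procedure: the magnitude spectrum of the recorded response to the broadband excitation is computed, and local maxima in the 200-4,000 Hz band are taken as resonance candidates.)

import numpy as np
from scipy.signal import find_peaks

def resonance_peaks(response, fs, f_lo=200.0, f_hi=4000.0):
    spectrum = np.abs(np.fft.rfft(response))
    freqs = np.fft.rfftfreq(len(response), d=1.0 / fs)
    band = (freqs >= f_lo) & (freqs <= f_hi)
    # the 5% prominence threshold is illustrative; tune to the measurement noise
    peaks, _ = find_peaks(spectrum[band], prominence=spectrum[band].max() * 0.05)
    return freqs[band][peaks]

# response: recorded pressure signal (numpy array); fs: sampling rate in Hz
print(resonance_peaks(response, fs))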

RESULTS: Measurements of the phonated vowels before and after the different glottal adjustments were highly reproducible. Vocal tract resonances and the ones resulting during respiration are reported. The impedance spectra show vowel dependent resonances with closed and open glottis. The formants of the vocal spectra are explained by including both, the vocal tract, and the subglottal resonances.

CONCLUSION: The findings indicate that subglottal resonances influence the first formant as well as the singer's formant cluster in high-performance singing. The instrumental setup used for the impedance measurement allows a simple and lightweight procedure for measuring vocal tract and subglottal resonances.}, } @article {pmid35232067, year = {2022}, author = {Luberadzka, J and Kayser, H and Hohmann, V}, title = {Making sense of periodicity glimpses in a prediction-update-loop-A computational model of attentive voice tracking.}, journal = {The Journal of the Acoustical Society of America}, volume = {151}, number = {2}, pages = {712}, pmid = {35232067}, issn = {1520-8524}, support = {R01 DC015429/DC/NIDCD NIH HHS/United States ; }, mesh = {Acoustics ; Bayes Theorem ; Computer Simulation ; Humans ; Periodicity ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {Humans are able to follow a speaker even in challenging acoustic conditions. The perceptual mechanisms underlying this ability remain unclear. A computational model of attentive voice tracking, consisting of four computational blocks: (1) sparse periodicity-based auditory features (sPAF) extraction, (2) foreground-background segregation, (3) state estimation, and (4) top-down knowledge, is presented. The model connects the theories about auditory glimpses, foreground-background segregation, and Bayesian inference. It is implemented with the sPAF, sequential Monte Carlo sampling, and probabilistic voice models. The model is evaluated by comparing it with the human data obtained in the study by Woods and McDermott [Curr. Biol. 25(17), 2238-2246 (2015)], which measured the ability to track one of two competing voices with time-varying parameters [fundamental frequency (F0) and formants (F1,F2)]. Three model versions were tested, which differ in the type of information used for the segregation: version (a) uses the oracle F0, version (b) uses the estimated F0, and version (c) uses the spectral shape derived from the estimated F0 and oracle F1 and F2. Version (a) simulates the optimal human performance in conditions with the largest separation between the voices, version (b) simulates the conditions in which the separation is not sufficient to follow the voices, and version (c) is closest to the human performance for moderate voice separation.}, } @article {pmid35232065, year = {2022}, author = {Saba, JN and Hansen, JHL}, title = {The effects of Lombard perturbation on speech intelligibility in noise for normal hearing and cochlear implant listeners.}, journal = {The Journal of the Acoustical Society of America}, volume = {151}, number = {2}, pages = {1007}, pmid = {35232065}, issn = {1520-8524}, support = {R01 DC016839/DC/NIDCD NIH HHS/United States ; }, mesh = {Acoustic Stimulation/methods ; *Cochlear Implants ; Hearing ; Speech Intelligibility ; *Speech Perception ; }, abstract = {Natural compensation of speech production in challenging listening environments is referred to as the Lombard effect (LE). The resulting acoustic differences between neutral and Lombard speech have been shown to provide intelligibility benefits for normal hearing (NH) and cochlear implant (CI) listeners alike. Motivated by this outcome, three LE perturbation approaches consisting of pitch, duration, formant, intensity, and spectral contour modifications were designed specifically for CI listeners to combat speech-in-noise performance deficits.
Experiment 1 analyzed the effects of loudness, quality, and distortion of approaches on speech intelligibility with and without formant-shifting. Significant improvements of +9.4% were observed in CI listeners without the formant-shifting approach at +5 dB signal-to-noise ratio (SNR) large-crowd-noise (LCN) when loudness was controlled; however, performance was found to be significantly lower for NH listeners. Experiment 2 evaluated the non-formant-shifting approach with additional spectral contour and high-pass filtering to reduce spectral smearing and decrease distortion observed in Experiment 1. This resulted in significant intelligibility benefits of +30.2% for NH and +21.2% for CI listeners at 0 and +5 dB SNR LCN, respectively. These results suggest that LE perturbation may be useful as a front-end speech modification approach to improve intelligibility for CI users in noise.}, } @article {pmid35180005, year = {2022}, author = {Sen, A and Thakkar, H and Vincent, V and Rai, S and Singh, A and Mohanty, S and Roy, A and Ramakrishnan, L}, title = {Endothelial colony forming cells' tetrahydrobiopterin level in coronary artery disease patients and its association with circulating endothelial progenitor cells.}, journal = {Canadian journal of physiology and pharmacology}, volume = {100}, number = {5}, pages = {473-485}, doi = {10.1139/cjpp-2021-0548}, pmid = {35180005}, issn = {1205-7541}, mesh = {Biopterins/analogs & derivatives ; *Coronary Artery Disease ; *Endothelial Progenitor Cells ; Humans ; }, abstract = {Endothelial colony forming cells (ECFCs) participate in neovascularization. Endothelial nitric oxide synthase (eNOS) derived NO· helps in homing of endothelial progenitor cells (EPCs) at the site of vascular injury. The enzyme cofactor tetrahydrobiopterin (BH4) stabilizes the catalytic active state of eNOS. The association of intracellular ECFC biopterins and the ratio of reduced to oxidized biopterin (BH4:BH2) with circulatory EPCs and ECFC functionality has not been studied. We investigated ECFC biopterin levels and their association with circulatory EPCs as well as ECFC proliferative potential in terms of day of appearance in culture. Circulatory EPCs were enumerated by flow cytometry in 53 coronary artery disease (CAD) patients and 42 controls. ECFCs were cultured, characterized, and biopterin levels assessed by high-performance liquid chromatography. The day of appearance of ECFC colonies and their number were recorded. Circulatory EPCs were significantly lower in CAD, and ECFCs appeared in 56% and 33% of CAD and control subjects, respectively. Intracellular BH4 and BH4:BH2 were significantly reduced in CAD. BH4:BH2 was positively correlated with circulatory EPCs (p = 0.01), and negatively with day of appearance of ECFCs (p = 0.04). Circulatory EPCs negatively correlated with ECFC appearance (p = 0.02).
These findings suggest a role for biopterins in maintaining circulatory EPCs and the functional integrity of ECFCs.}, } @article {pmid35175986, year = {2022}, author = {Lou, Q and Wang, X and Jiang, L and Wang, G and Chen, Y and Liu, Q}, title = {Subjective and Objective Evaluation of Speech in Adult Patients with Unrepaired Cleft Palate.}, journal = {The Journal of craniofacial surgery}, volume = {33}, number = {5}, pages = {e528-e532}, doi = {10.1097/SCS.0000000000008567}, pmid = {35175986}, issn = {1536-3732}, mesh = {Adult ; *Cleft Palate/complications/surgery ; Humans ; Speech ; Speech Disorders/diagnosis/etiology ; Speech Intelligibility ; Speech Production Measurement/methods ; Voice Quality ; Young Adult ; }, abstract = {OBJECTIVE: To explore the speech outcomes of adult patients through subjective perception evaluation and objective acoustic analysis, and to compare the differences in pronunciation characteristics between adult speakers with unrepaired cleft palate and their non-cleft peers.

PARTICIPANTS AND INTERVENTION: Subjective evaluation indicators included speech intelligibility, nasality, and consonant missing rate, whereas objective acoustic parameters included normalized vowel formants, voice onset time, and three-dimensional spectrogram and spectrum analysis. Both evaluations were carried out on speech samples produced by two groups of speakers: (a) speakers with unrepaired cleft palate (n = 65, mean age = 25.1 years) and (b) typical speakers (n = 30, mean age = 23.7 years).
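[Editor's note: the "normalized vowel formants" above usually imply a speaker-intrinsic normalization; the minimal Python sketch below shows Lobanov z-score normalization, one common choice. The token values are invented placeholders, and this is not code from the study.]

```python
import numpy as np

def lobanov_normalize(formants):
    """Z-score (Lobanov) normalization of formant values for one speaker.

    formants: array of shape (n_tokens, n_formants), e.g., F1/F2 in Hz.
    Returns dimensionless z-scores, removing speaker-specific vocal tract
    differences before between-group comparisons.
    """
    formants = np.asarray(formants, dtype=float)
    return (formants - formants.mean(axis=0)) / formants.std(axis=0)

# Hypothetical F1/F2 values (Hz) for one speaker's vowel tokens
tokens = np.array([[800.0, 1200.0],   # /a/-like token
                   [350.0, 2300.0],   # /i/-like token
                   [450.0,  900.0]])  # /u/-like token
print(lobanov_normalize(tokens))
```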

RESULTS: Compared with typical speakers, individuals with unrepaired cleft palate exhibited lower speech intelligibility with higher nasality and consonant missing rates; the missing rate was highest for the six consonant syllables. The acoustic differences were mainly manifested in vowel formants and voice onset time.

CONCLUSIONS: The results revealed important acoustical differences between adult patients with unrepaired cleft palate and typical speakers. The trend of spectral deviation may have contributed to the difficulty in producing pressure vowels and aspirated consonants in individuals with speech disorders related to cleft palate.}, } @article {pmid35166414, year = {2022}, author = {Nguyen, DD and Chacon, A and Payten, C and Black, R and Sheth, M and McCabe, P and Novakovic, D and Madill, C}, title = {Acoustic characteristics of fricatives, amplitude of formants and clarity of speech produced without and with a medical mask.}, journal = {International journal of language & communication disorders}, volume = {57}, number = {2}, pages = {366-380}, pmid = {35166414}, issn = {1460-6984}, mesh = {Acoustics ; Humans ; Phonetics ; *Speech ; Speech Acoustics ; Speech Disorders ; *Speech Perception ; }, abstract = {BACKGROUND: Previous research has found that high-frequency energy of speech signals decreased while wearing face masks. However, no study has examined the specific spectral characteristics of fricative consonants and vowels and the perception of clarity of speech in mask wearing.

AIMS: To investigate the acoustic-phonetic characteristics of fricative consonants and vowels and auditory-perceptual ratings of the clarity of speech produced with and without a face mask.

METHODS & PROCEDURES: A total of 16 healthcare workers read the Rainbow Passage using modal phonation in three conditions: without a face mask, with a standard surgical mask, and with a KN95 mask (China GB2626-2006, a medical respirator with a higher barrier level than the standard surgical mask). Speech samples were acoustically analysed for root mean square (RMS) amplitude (ARMS) and spectral moments of four fricatives /f/, /s/, /ʃ/ and /z/; and amplitude of the first three formants (A1, A2 and A3) measured from the reading passage and extracted vowels. Auditory-perceptual rating of speech clarity was performed. Data were compared across mask and non-mask conditions using linear mixed models.
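[Editor's note: the RMS amplitude and spectral moments named above can be computed along the lines of this minimal Python sketch; the 44.1-kHz rate and the synthetic noise burst are assumptions, and this is not the study's analysis script.]

```python
import numpy as np

def fricative_measures(segment, sr=44100):
    """RMS amplitude (ARMS) and first two spectral moments of a fricative.

    segment: 1-D array of audio samples for the segmented fricative.
    Returns (rms, centre_of_gravity_hz, spectral_sd_hz).
    """
    rms = np.sqrt(np.mean(segment ** 2))            # overall amplitude
    spectrum = np.abs(np.fft.rfft(segment)) ** 2    # power spectrum
    freqs = np.fft.rfftfreq(len(segment), d=1 / sr)
    p = spectrum / spectrum.sum()                   # normalize to a distribution
    cog = np.sum(freqs * p)                         # 1st moment: centre of gravity
    sd = np.sqrt(np.sum(((freqs - cog) ** 2) * p))  # 2nd moment: spectral SD
    return rms, cog, sd

# Hypothetical /s/-like noise burst for demonstration (100 ms at 44.1 kHz)
noise = np.random.default_rng(0).normal(size=4410)
print(fricative_measures(noise))
```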

OUTCOMES & RESULTS: The ARMS of all included fricatives was significantly lower in the surgical mask and KN95 mask conditions than in the non-mask condition. The centre of gravity of /f/ decreased in both the surgical and KN95 masks, while other spectral moments did not show systematic significant linear trends across mask conditions. None of the formant amplitude measures was statistically different across conditions. Speech clarity was significantly poorer in both the surgical and KN95 mask conditions.

Speech produced while wearing either a surgical mask or KN95 mask was associated with decreased fricative amplitude and poorer speech clarity.

WHAT THIS PAPER ADDS: What is already known on the subject Previous studies have shown that the overall spectral levels in high frequency ranges and intelligibility are decreased for speech produced with a face mask. It is unclear how different types of speech signals, that is, fricatives and vowels, are affected in speech produced while wearing either a medical surgical mask or a KN95 mask. It is also unclear whether ratings of speech clarity are similar for speech produced with these face masks. What this paper adds to existing knowledge Speech data collected in a real-world, clinical, non-laboratory-controlled setting showed differences in the amplitude of fricatives and speech clarity ratings between non-mask and mask-wearing conditions. Formant amplitude did not show significant differences in mask-wearing conditions compared with the non-mask condition. What are the potential or actual clinical implications of this work? Wearing a surgical mask or a KN95 mask had different effects on consonants and vowels. It appeared from the findings in this study that these masks only affected fricative consonants and did not affect vowel production. The poorer speech clarity in these mask-wearing conditions has important implications for speech perception in communication between clinical staff and between medical officers and patients in clinics, and between people in everyday situations. The impact of these masks on speech perception may be more pronounced in people with hearing impairment and communication disorders. In voice evaluation and/or therapy sessions, the effects of wearing a medical mask can occur bidirectionally for both the clinician and the patient. The patient may find it more challenging to understand the speech conveyed by the clinician, while the clinician may not perceptually assess the patient's speech and voice accurately. Given the significant correlation between clarity ratings and fricative amplitude, improving fricative signals would be useful to improve speech clarity while wearing these medical face masks.}, } @article {pmid35142977, year = {2022}, author = {Gábor, A and Kaszás, N and Faragó, T and Pérez Fraga, P and Lovas, M and Andics, A}, title = {The acoustic bases of human voice identity processing in dogs.}, journal = {Animal cognition}, volume = {25}, number = {4}, pages = {905-916}, pmid = {35142977}, issn = {1435-9456}, support = {LP2017-13/2017//magyar tudományos akadémia/ ; 950159//h2020 european research council/ ; ÚNKP-20-4-II-ELTE-286//hungarian ministry for innovation and technology, national research, development and innovation fund/ ; ÚNKP-20-5-ELTE-337//hungarian ministry for innovation and technology, national research, development and innovation fund/ ; BO/751/20//mta bolyai research scholarship, hungary/ ; ÚNKP-21-5-ELTE-1061//Hungarian Ministry for Innovation and Technology, National Research, Development and Innovation Fund/ ; }, mesh = {Acoustics ; Animals ; Cues ; Dogs ; Humans ; Recognition, Psychology ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {Speech carries identity-diagnostic acoustic cues that help individuals recognize each other during vocal-social interactions. In humans, fundamental frequency, formant dispersion and harmonics-to-noise ratio serve as characteristics along which speakers can be reliably separated. The ability to infer a speaker's identity is also adaptive for members of other species (like companion animals) for whom humans (as owners) are relevant.
The acoustic bases of speaker recognition in non-humans are unknown. Here, we tested whether dogs can recognize their owner's voice and whether they rely on the same acoustic parameters for such recognition as humans use to discriminate speakers. Stimuli were pre-recorded sentences spoken by the owner and control persons, played through loudspeakers placed behind two non-transparent screens (with each screen hiding a person). We investigated the association between acoustic distance of speakers (examined along several dimensions relevant in intraspecific voice identification) and dogs' behavior. Dogs chose their owner's voice more often than those of the control persons, suggesting that they can identify it. Choosing success and time spent looking in the direction of the owner's voice were positively associated, showing that looking time is an index of the ease of choice. Acoustic distance of speakers in mean fundamental frequency and jitter was positively associated with looking time, indicating that the shorter the acoustic distance between speakers with regard to these parameters, the harder the decision. So, dogs use these cues to discriminate their owner's voice from unfamiliar voices. These findings reveal that dogs use some but probably not all acoustic parameters that humans use to identify speakers. Although dogs can detect fine changes in speech, their perceptual system may not be fully attuned to identity-diagnostic cues in the human voice.}, } @article {pmid35141903, year = {2022}, author = {V, K and S, SP}, title = {Hybrid machine learning classification scheme for speaker identification.}, journal = {Journal of forensic sciences}, volume = {67}, number = {3}, pages = {1033-1048}, doi = {10.1111/1556-4029.15006}, pmid = {35141903}, issn = {1556-4029}, mesh = {*Machine Learning ; Speech ; *Support Vector Machine ; }, abstract = {Motivated by the requirement to prepare for the next generation of "Automatic Spokesperson Recognition" (ASR) systems, this paper applied fused spectral features with a hybrid machine learning (ML) strategy in the speech communication field. This strategy combined spectral features such as mel-frequency cepstral coefficients (MFCCs), spectral kurtosis, spectral skewness, normalized pitch frequency (NPF), and formants. The suggested classification method could possibly serve in advanced speaker identification scenarios. Special attention was given to a hybrid ML scheme capable of finding unknown speakers, equipped with a speaker-identifying classifier technique known as "Random Forest-Support Vector Machine" (RF-SVM). The extracted speaker-specific spectral attributes are applied to the hybrid RF-SVM classifier to identify/verify the particular speaker. This work aims to construct an ensemble decision tree on a bounded area with minimal misclassification error using a hybrid ensemble RF-SVM strategy. A series of standard, real-time speaker databases, and noise conditions are functionally tested to validate its performance with other state-of-the-art mechanisms. The proposed fusion method succeeds in the speaker identification task with a high identification rate (97% avg) and lower equal error rate (EER) (<2%), compared with the individual schemes for the recorded experimental dataset. The robustness of the classifier is validated using the standard ELSDSR, TIMIT, and NIST audio datasets.
Experiments on the ELSDSR, TIMIT, and NIST datasets show that the hybrid classifier produces 98%, 99%, and 94% accuracy, with EERs of 2%, 1%, and 2%, respectively. The findings were then compared with other well-known speaker recognition schemes and found to be superior.}, } @article {pmid35135714, year = {2024}, author = {Menezes, DP and de Lira, ZS and Araújo, ANB and de Almeida, AAF and Gomes, AOC and Moraes, BT and Lucena, JA}, title = {Prosodic Differences in the Voices of Transgender and Cisgender Women: Self-Perception of Voice - An Auditory and Acoustic Analysis.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {4}, pages = {844-857}, doi = {10.1016/j.jvoice.2021.12.020}, pmid = {35135714}, issn = {1873-4588}, mesh = {Humans ; Female ; *Transgender Persons/psychology ; *Voice Quality ; Adult ; *Speech Acoustics ; *Self Concept ; *Speech Perception ; Young Adult ; *Emotions ; *Speech Production Measurement ; *Acoustics ; Cross-Sectional Studies ; Middle Aged ; Male ; Transsexualism/physiopathology/psychology ; Gender Identity ; }, abstract = {INTRODUCTION: The voice is an important parameter for identifying the speaker's gender. Transgender people seek to adapt their bodies to their gender identity, and transgender women have greater difficulties in achieving vocal acceptance. In this context, the evaluation of the various parameters of the voice of transgender and cisgender women is essential to make it possible to propose appropriate intervention measures.

OBJECTIVES: To identify the differences in vocal characteristics between transgender and cisgender women.

METHODS: A cross-sectional study was conducted. The sample comprised 20 transgender women and 20 cisgender women who underwent evaluation of acoustic parameters, emotional prosody, self-perception, and perception of gender by lay listeners.

RESULTS: The vocal characteristics of transgender and cisgender women differ in terms of the following parameters: f0, glottal noise excitation (GNE), vocal intensity, speech range profile (SRP), the first three formants of the vowel /a/, and emotional prosody, including duration and melodic contour. Higher values were mostly found in the cisgender population, except for noise level and vocal intensity. In addition, in most cases lay listeners identified the voices of transgender women as belonging to the male gender. There was a negative correlation between vocal dissatisfaction and f0 among transgender women.

CONCLUSIONS: Even though they perform vocal adjustments, the voices of transgender women differ from those of cisgender women in terms of acoustic parameters, vocal extension, and emotional prosody, including duration and melodic contour. These differences have repercussions on the perception of gender by listeners.}, } @article {pmid35130577, year = {2022}, author = {Rishiq, D and Harkrider, A and Springer, C and Hedrick, M}, title = {Effects of Spectral Shaping on Speech Auditory Brainstem Responses to Stop Consonant-Vowel Syllables.}, journal = {Journal of the American Academy of Audiology}, volume = {33}, number = {4}, pages = {232-243}, doi = {10.1055/a-1764-9805}, pmid = {35130577}, issn = {2157-3107}, mesh = {Humans ; Male ; Young Adult ; Aged ; Evoked Potentials, Auditory, Brain Stem/physiology ; Speech ; Acoustic Stimulation/methods ; *Speech Perception/physiology ; *Hearing Aids ; }, abstract = {BACKGROUND: Spectral shaping is employed by hearing aids to make consonantal information, such as formant transitions, audible for listeners with hearing loss. How manipulations of the stimuli, such as spectral shaping, may alter encoding in the auditory brainstem has not been thoroughly studied.

PURPOSE: The aim of this study was to determine how spectral shaping of synthetic consonant-vowel (CV) syllables, varying in their second formant (F2) onset frequency, may affect encoding of the syllables in the auditory brainstem.

RESEARCH DESIGN: We employed a repeated-measures design.

STUDY SAMPLE: Sixteen young adults (mean = 20.94 years, 6 males) and 11 older adults (mean = 58.60 years, 4 males) participated in this study.

DATA COLLECTION AND ANALYSIS: Speech-evoked auditory brainstem responses (speech-ABRs) were obtained from each participant using three CV exemplars selected from synthetic stimuli generated for a /ba-da-ga/ continuum. Brainstem responses were also recorded to the corresponding three CV exemplars that were spectrally shaped to decrease low-frequency information and provide gain for middle and high frequencies according to a Desired Sensation Level function. In total, six grand average waveforms (3 phonemes [/ba/, /da/, /ga/] X 2 shaping conditions [unshaped, shaped]) were produced for each participant. Peak latencies and amplitudes, referenced to the prestimulus baseline, were identified for 15 speech-ABR peaks. Peaks were marked manually using the program cursor on each individual waveform. Repeated-measures analyses of variance were used to determine the effects of shaping on the latencies and amplitudes of the speech-ABR peaks.
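[Editor's note: although peaks were marked manually in this study, the latency/amplitude measurement itself can be approximated programmatically; the Python sketch below uses scipy.signal.find_peaks on a synthetic averaged waveform and is purely illustrative, not the authors' procedure.]

```python
import numpy as np
from scipy.signal import find_peaks

fs = 16000                                   # assumed sampling rate (Hz)
t = np.arange(-0.01, 0.05, 1 / fs)           # 10 ms prestimulus baseline
# Synthetic averaged waveform: damped 500 Hz oscillation plus noise
waveform = np.sin(2 * np.pi * 500 * t) * np.exp(-60 * np.abs(t - 0.01))
waveform += np.random.default_rng(1).normal(scale=0.02, size=t.size)

baseline = waveform[t < 0].mean()            # prestimulus reference level
peaks, _ = find_peaks(waveform, height=baseline + 0.1,
                      distance=int(0.002 * fs))  # peaks at least 2 ms apart

for p in peaks:
    latency_ms = t[p] * 1000                 # latency re: stimulus onset
    amplitude = waveform[p] - baseline       # amplitude re: baseline
    print(f"peak at {latency_ms:.2f} ms, amplitude {amplitude:.3f}")
```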

RESULTS: Shaping effects produced changes within participants in ABR latencies and amplitudes involving onset and major peaks of the speech-ABR waveform for certain phonemes. Specifically, data from onset peaks showed that shaping decreased latency for /ga/ in older listeners and decreased onset amplitude for /ba/ in younger listeners. Shaping also increased the amplitudes of major peaks for /ga/ stimuli in both groups.

CONCLUSIONS: Encoding of speech in the ABR waveform may be more complex and multidimensional than a simple demarcation of source and filter information. These results suggest a more complex subcortical encoding of vocal tract filter information in the ABR waveform, which may also be influenced by cue intensity and age.

DESIGN: EFRs were elicited by a male-spoken /i/ (stimulus; duration = 350 msec), modified to elicit two EFRs, one from the region of the first formant (F1) and one from the second and higher formants (F2+). The stimulus, presented at 65 dB SPL, was preceded by one of the four contexts: /ʃ/, /m/, /i/ or a silent gap of duration equal to that of the stimulus. The level of the context phonemes was either 50 or 80 dB SPL, 15 dB lower and higher than the level of the stimulus /i/. In a control condition, EFRs to the stimulus /i/ were elicited in isolation without any preceding phoneme contexts. The stimulus and the contexts were presented monaurally to a randomly chosen test ear in 21 young adults with normal hearing. EFRs were recorded using single-channel electroencephalogram between the vertex and the nape.
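[Editor's note: EFR amplitudes of the kind reported below in nanovolts are conventionally quantified as the spectral magnitude of the averaged EEG at the voice fundamental; this minimal Python sketch assumes a 100-Hz f0 and synthetic data, and is not the study's analysis pipeline.]

```python
import numpy as np

fs = 8000          # assumed EEG sampling rate (Hz)
f0 = 100.0         # assumed voice fundamental frequency (Hz)
dur = 0.35         # stimulus duration from the abstract (350 ms)
t = np.arange(0, dur, 1 / fs)

# Synthetic averaged EEG: a small response at f0 buried in noise (volts)
rng = np.random.default_rng(2)
eeg = 100e-9 * np.sin(2 * np.pi * f0 * t) + rng.normal(scale=50e-9, size=t.size)

# Amplitude spectrum of the averaged response
spectrum = np.abs(np.fft.rfft(eeg)) * 2 / t.size
freqs = np.fft.rfftfreq(t.size, d=1 / fs)
bin_f0 = np.argmin(np.abs(freqs - f0))       # FFT bin nearest f0

print(f"EFR amplitude at {freqs[bin_f0]:.1f} Hz: {spectrum[bin_f0] * 1e9:.1f} nV")
```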

RESULTS: A repeated measures analysis of variance indicated a significant three-way interaction between context type (/ʃ/, /i/, /m/, silent gap), level (50, 80 dB SPL), and EFR-eliciting formant (F1, F2+). Post hoc analyses indicated no influence of the preceding phoneme context on F1-elicited EFRs. Relative to a silent gap as the preceding context, F2+-elicited EFRs were attenuated by /ʃ/ and /m/ presented at 50 and 80 dB SPL, as well as by /i/ presented at 80 dB SPL. The average attenuation ranged from 14.9 to 27.9 nV. When the context phonemes were presented at matched levels of 50 or 80 dB SPL, F2+-elicited EFRs were most often attenuated when preceded by /ʃ/. At 80 dB SPL, relative to the silent preceding gap, the average attenuation was 15.7 nV, and at 50 dB SPL, relative to the preceding context phoneme /i/, the average attenuation was 17.2 nV.

CONCLUSION: EFRs elicited by the second and higher formants of /i/ are sensitive to the spectral and level characteristics of the preceding phoneme context. Such sensitivity, measured as an attenuation in the present study, may influence the comparison of EFRs elicited by the same vowel in different consonant-vowel syllables or words. However, the degree of attenuation with realistic context levels exceeded the minimum measurable change only 12% of the time. Although the impact of the preceding context is statistically significant, it is likely to be clinically insignificant a majority of the time.}, } @article {pmid35111103, year = {2021}, author = {Chiu, C and Weng, Y and Chen, BW}, title = {Tongue Postures and Tongue Centers: A Study of Acoustic-Articulatory Correspondences Across Different Head Angles.}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {768754}, pmid = {35111103}, issn = {1664-1078}, abstract = {Recent research on body and head positions has shown that postural changes may induce varying degrees of changes on acoustic speech signals and articulatory gestures. While the preservation of formant profiles across different postures is suitably accounted for by the two-tube model and perturbation theory, it remains unclear whether it is resulted from the accommodation of tongue postures. Specifically, whether the tongue accommodates the changes in head angle to maintain the target acoustics is yet to be determined. The present study examines vowel acoustics and their correspondence with the articulatory maneuvers of the tongue, including both tongue postures and movements of the tongue center, across different head angles. The results show that vowel acoustics, including pitch and formants, are largely unaffected by upward or downward tilting of the head. These preserved acoustics may be attributed to the lingual gestures that compensate for the effects of gravity. Our results also reveal that the tongue postures in response to head movements appear to be vowel-dependent, and the tongue center may serve as an underlying drive that covariates with the head angle changes. These results imply a close relationship between vowel acoustics and tongue postures as well as a target-oriented strategy for different head angles.}, } @article {pmid35105035, year = {2022}, author = {Merritt, B and Bent, T}, title = {Revisiting the acoustics of speaker gender perception: A gender expansive perspective.}, journal = {The Journal of the Acoustical Society of America}, volume = {151}, number = {1}, pages = {484}, doi = {10.1121/10.0009282}, pmid = {35105035}, issn = {1520-8524}, mesh = {Acoustics ; Female ; Femininity ; Humans ; Male ; Masculinity ; *Speech Acoustics ; *Speech Perception ; }, abstract = {Examinations of speaker gender perception have primarily focused on the roles of fundamental frequency (fo) and formant frequencies from structured speech tasks using cisgender speakers. Yet, there is evidence to suggest that fo and formants do not fully account for listeners' perceptual judgements of gender, particularly from connected speech. This study investigated the perceptual importance of fo, formant frequencies, articulation, and intonation in listeners' judgements of gender identity and masculinity/femininity from spontaneous speech from cisgender male and female speakers as well as transfeminine and transmasculine speakers. 
Stimuli were spontaneous speech samples from 12 speakers who are cisgender (6 female and 6 male) and 12 speakers who are transgender (6 transfeminine and 6 transmasculine). Listeners performed a two-alternative forced choice (2AFC) gender identification task and masculinity/femininity rating task in two experiments that manipulated which acoustic cues were available. Experiment 1 confirmed that fo and formant frequency manipulations were insufficient to alter listener judgements across all speakers. Experiment 2 demonstrated that articulatory cues had greater weighting than intonation cues on the listeners' judgements when the fo and formant frequencies were in a gender ambiguous range. These findings counter the assumptions that fo and formant manipulations are sufficient to effectively alter perceived speaker gender.}, } @article {pmid35104414, year = {2022}, author = {Kim, Y and Chung, H and Thompson, A}, title = {Acoustic and Articulatory Characteristics of English Semivowels /ɹ, l, w/ Produced by Adult Second-Language Speakers.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {3}, pages = {890-905}, doi = {10.1044/2021_JSLHR-21-00152}, pmid = {35104414}, issn = {1558-9102}, mesh = {Acoustics ; Adult ; Humans ; *Language ; *Multilingualism ; Phonetics ; Speech Acoustics ; }, abstract = {PURPOSE: This study presents the results of acoustic and kinematic analyses of word-initial semivowels (/ɹ, l, w/) produced by second-language (L2) speakers of English whose native language is Korean. In addition, the relationship of acoustic and kinematic measures to the ratings of foreign accent was examined by correlation analyses.

METHOD: Eleven L2 speakers and 10 native speakers (first language [L1]) of English read The Caterpillar passage. Acoustic and kinematic data were simultaneously recorded using an electromagnetic articulography system. In addition to speaking rate, two acoustic measures (ratio of third-formant [F3] frequency to second-formant [F2] frequency and duration of steady states of F2) and two kinematic measures (lip aperture and duration of lingual maximum hold) were obtained from individual target sounds. To examine the degree of contrast among the three sounds, acoustic and kinematic Euclidean distances were computed on the F2-F3 and x-y planes, respectively.
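[Editor's note: the acoustic Euclidean distance on the F2-F3 plane reduces to the arithmetic in this short Python sketch; the formant coordinates are invented placeholders, not measurements from the study.]

```python
import numpy as np
from itertools import combinations

# Hypothetical mean (F2, F3) coordinates in Hz for the three semivowels
semivowels = {
    "r": np.array([1100.0, 1700.0]),  # /ɹ/: a low F3 is its acoustic hallmark
    "l": np.array([1200.0, 2600.0]),  # /l/
    "w": np.array([ 800.0, 2300.0]),  # /w/
}

# Pairwise Euclidean distances on the F2-F3 plane; larger distances
# indicate better-separated (more contrastive) productions.
for (a, pa), (b, pb) in combinations(semivowels.items(), 2):
    print(f"/{a}/-/{b}/ distance: {np.linalg.norm(pa - pb):.0f} Hz")
```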

RESULTS: Compared with L1 speakers, L2 speakers exhibited a significantly slower speaking rate. For the three semivowels, L2 speakers showed a reduced F3/F2 ratio during constriction, increased lip aperture, and reduced acoustic Euclidean distances among semivowels. Additionally, perceptual ratings of foreign accent were significantly correlated with three measures: duration of steady F2, acoustic Euclidean distance, and kinematic Euclidean distance.

CONCLUSIONS: The findings provide acoustic and kinematic evidence for challenges that L2 speakers experience in the production of English semivowels, especially /ɹ/ and /w/. The robust and consistent finding of reduced contrasts among semivowels and their correlations with perceptual accent ratings suggests using sound contrasts as a potentially effective approach to accent modification paradigms.}, } @article {pmid35093243, year = {2022}, author = {Takemoto, N and Sanuki, T and Esaki, S and Iwasaki, S}, title = {Rabbit model with vocal fold hyperadduction.}, journal = {Auris, nasus, larynx}, volume = {49}, number = {5}, pages = {810-815}, doi = {10.1016/j.anl.2022.01.008}, pmid = {35093243}, issn = {1879-1476}, mesh = {Animals ; *Dysphonia ; Glottis ; Humans ; Laryngeal Muscles ; Phonation/physiology ; Rabbits ; *Vocal Cords ; }, abstract = {OBJECTIVE: Adductor spasmodic dysphonia (AdSD) is caused by hyperadduction of the vocal folds during phonation, resulting in a strained voice. Animal models are not yet used to elucidate this intractable disease because AdSD has a difficult pathology without a definitive origin. For the first step, we established an animal model with vocal fold hyperadduction and evaluated its validity by assessing laryngeal function.

METHODS: In this experimental animal study, three 20-week-old adult Japanese rabbits were used. The models were created using a combination of cricothyroid approximation, forced airflow, and electrical stimulation of the recurrent laryngeal nerves (RLNs). Cricothyroid approximation was added to produce a glottal slit. Thereafter, both RLNs were electrically stimulated to induce vocal fold hyperadduction. Finally, the left RLN was transected to relieve the hyperadduction. The sound, endoscopic images, and subglottal pressure were recorded, and acoustic analysis was performed.

RESULTS: Subglottal pressure increased significantly, and a strained sound was produced after the electrical stimulation of the RLNs. After transecting the left RLN, the subglottal pressure decreased significantly, and the strained sound diminished. Acoustic analysis revealed an elevation of the standard deviation of F0 (SDF0) and degree of voice breaks (DVB) during stimulation of the RLNs, and a reduction of SDF0 and DVB after RLN transection. Formant bands in the sound spectrogram were interrupted by the stimulation and appeared again after the RLN transection.
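[Editor's note: SDF0 can be derived from any F0 tracker; the Python sketch below uses librosa's pyin on a synthetic vibrato tone as a stand-in signal. The search bounds and the signal are assumptions, and this is one plausible computation rather than the authors' method.]

```python
import numpy as np
import librosa

sr = 22050
t = np.arange(0, 1.0, 1 / sr)

# Synthetic stand-in for a phonation: 500 Hz tone with 3 Hz vibrato
inst_f0 = 500 + 20 * np.sin(2 * np.pi * 3 * t)       # instantaneous F0 (Hz)
y = np.sin(2 * np.pi * np.cumsum(inst_f0) / sr).astype(np.float32)

# pYIN F0 track; the frequency bounds are assumptions for this illustration
f0, voiced_flag, voiced_prob = librosa.pyin(y, fmin=100, fmax=2000, sr=sr)

voiced_f0 = f0[~np.isnan(f0)]                        # keep voiced frames only
sdf0 = float(np.std(voiced_f0))                      # SDF0
print(f"SDF0: {sdf0:.1f} Hz over {voiced_f0.size} voiced frames")
```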

CONCLUSION: This study developed a rabbit model with vocal fold hyperadduction. The subglottal pressure and acoustic analysis of this model resembled the characteristics of patients with AdSD. This model could be helpful for elucidating the pathology of the larynx caused by hyperadduction and for evaluating and comparing treatments for strained phonation.}, } @article {pmid35086866, year = {2022}, author = {Heeringa, AN and Köppl, C}, title = {Auditory Nerve Fiber Discrimination and Representation of Naturally-Spoken Vowels in Noise.}, journal = {eNeuro}, volume = {9}, number = {1}, pages = {}, pmid = {35086866}, issn = {2373-2822}, mesh = {Auditory Perception/physiology ; Cochlear Nerve/physiology ; Nerve Fibers/physiology ; *Noise ; Phonetics ; Speech ; *Speech Perception/physiology ; }, abstract = {To understand how vowels are encoded by auditory nerve (AN) fibers, a number of representation schemes have been suggested that extract the vowel's formant frequencies from AN-fiber spiking patterns. The current study aims to apply and compare these schemes for AN-fiber responses to naturally-spoken vowels in a speech-shaped background noise. Responses to three vowels were evaluated; based on behavioral experiments in the same species, two of these were perceptually difficult to discriminate from each other (/e/ vs /i/), and one was perceptually easy to discriminate from the other two (/a:/). Single-unit AN fibers were recorded from ketamine/xylazine-anesthetized Mongolian gerbils of either sex (n = 8). First, single-unit discrimination between the three vowels was studied. Compared with the perceptually easy discriminations, the average spike timing-based discrimination values were significantly lower for the perceptually difficult vowel discrimination. This was not true for an average rate-based discrimination metric, the rate d-prime (d'). Consistently, spike timing-based representation schemes, plotting the temporal responses of all recorded units as a function of their best frequency (BF), i.e., dominant component schemes, average localized interval rate, and fluctuation profiles, revealed representation of the vowel's formant frequencies, whereas no such representation was apparent in the rate-based excitation pattern. Making use of perceptual discrimination data, this study reveals that discrimination difficulties of naturally-spoken vowels in speech-shaped noise originate peripherally and can be studied in the spike timing patterns of single AN fibers.}, } @article {pmid35077652, year = {2022}, author = {Yüksel, M}, title = {Reliability and Efficiency of Pitch-Shifting Plug-Ins in Voice and Hearing Research.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {3}, pages = {878-889}, doi = {10.1044/2021_JSLHR-21-00440}, pmid = {35077652}, issn = {1558-9102}, mesh = {Feedback, Sensory ; Female ; Hearing ; Humans ; Male ; *Music ; Pitch Perception ; Reproducibility of Results ; *Voice ; }, abstract = {PURPOSE: Auditory feedback perturbation with voice pitch manipulation has been widely used in previous studies. There are several hardware and software tools for such manipulations, but audio plug-ins developed for music, movies, and radio applications that operate in digital audio workstations may be extremely beneficial and are easy to use, accessible, and cost effective. However, it is unknown whether these plug-ins can perform similarly to tools that have been described in previous literature.
Hence, this study aimed to evaluate the reliability and efficiency of these plug-ins.

METHOD: Six different plug-ins were used at +1 and -1 semitone (st) pitch-shift settings, with formant correction on and off, to pitch shift sustained /ɑ/ voice recordings from 12 healthy participants (six cisgender males and six cisgender females). Pitch-shifting accuracy, formant shifting amount, intensity changes, and total latency values were reported.
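[Editor's note: pitch-shifting accuracy in cents follows directly from the ratio of measured to target frequency; the short Python sketch below shows the arithmetic with invented example values.]

```python
import numpy as np

def cents_error(f_measured, f_target):
    """Deviation of a measured F0 from its target, in cents (100 cents = 1 st)."""
    return 1200 * np.log2(f_measured / f_target)

f0_original = 220.0                       # hypothetical sustained /ɑ/ at 220 Hz
f0_target = f0_original * 2 ** (1 / 12)   # +1 st target (~233.08 Hz)
f0_measured = 233.2                       # hypothetical plug-in output

print(f"target: {f0_target:.2f} Hz, "
      f"error: {cents_error(f0_measured, f0_target):+.1f} cents")
```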

RESULTS: Some variability was observed between different plug-ins and pitch shift settings. One plug-in performed comparably to well-known hardware and software units in all four measured aspects, with 1-cent pitch-shifting accuracy, low latency values, negligible intensity difference, and preserved formants. Other plug-ins performed similarly in some respects.

CONCLUSIONS: Audio plug-ins may be used effectively in pitch-shifting applications. Researchers and clinicians can access these plug-ins easily and test whether the features also fit their aims.}, } @article {pmid35071434, year = {2021}, author = {Cao, S and Xia, M and Zhou, R and Wang, J and Jin, CY and Pei, B and Zhou, ZK and Qian, YM and Jiang, H}, title = {Voice parameters for difficult mask ventilation evaluation: an observational study.}, journal = {Annals of translational medicine}, volume = {9}, number = {23}, pages = {1740}, pmid = {35071434}, issn = {2305-5839}, abstract = {BACKGROUND: Mask ventilation (MV) is an essential component of airway management. Difficult mask ventilation (DMV) is a major cause for perioperative hypoxic brain injury; however, predicting DMV remains a challenge. This study aimed to determine the potential value of voice parameters as novel predictors of DMV in patients scheduled for general anesthesia.

METHODS: We included 1,160 adult patients scheduled for elective surgery under general anesthesia. The clinical variables usually reported as predictors of DMV were collected before surgery. Voice samples of the phonemes ([a], [o], [e], [i], [u], [ü], [ci], [qi], [chi], [le], [ke], and [en]) were recorded and their formants (f1-f4) and bandwidths (bw1-bw4) were extracted. DMV was defined as the inability of an unassisted anesthesiologist to ensure adequate ventilation during MV under general anesthesia. Univariate and multivariate logistic regression analyses were used to explore the association between voice parameters and DMV. The predictive value of the voice parameters was evaluated by assessing the area under the curve (AUC) of receiver operating characteristic (ROC) curves of a stepwise forward model.
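[Editor's note: the modeling step described above pairs logistic regression with ROC analysis; the scikit-learn sketch below reproduces that generic workflow on simulated stand-in data. The feature columns and coefficients are invented, and this is not the study's model.]

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(3)
n = 1160                                    # cohort size from the abstract
X = rng.normal(size=(n, 5))                 # stand-ins for formant/bandwidth predictors
logit = X @ np.array([0.8, -0.5, 0.3, 0.0, 0.6]) - 1.5
y = rng.random(n) < 1 / (1 + np.exp(-logit))   # simulated binary DMV outcome

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
model = LogisticRegression().fit(X_tr, y_tr)

auc = roc_auc_score(y_te, model.predict_proba(X_te)[:, 1])
print(f"AUC: {auc:.3f}")                    # the paper reports 0.779 for its model
```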

RESULTS: The prevalence of DMV was 218/1,160 (18.8%). The AUC of the stepwise forward model (including o_f4, e_bw2, i_f3, u_pitch, u_f1, u_f4, ü_bw4, ci_f1, qi_f1, qi_f4, qi_bw4, chi_f1, chi_bw2, chi_bw4, le_pitch, le_bw3, ke_bw2, en_pitch, en_f2, and en_bw4) attained a value of 0.779. The sensitivity and specificity of the model were 75.0% and 71.0%, respectively.

CONCLUSIONS: Voice parameters may be considered as alternative predictors of DMV, but additional studies are needed to confirm the initial findings.}, } @article {pmid35069371, year = {2021}, author = {Lee, A and Ng, E}, title = {Hong Kong Women Project a Larger Body When Speaking to Attractive Men.}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {786507}, pmid = {35069371}, issn = {1664-1078}, abstract = {In this pilot study we investigated the vocal strategies of Cantonese women when addressing an attractive vs. unattractive male. We recruited 19 young female native speakers of Hong Kong Cantonese who completed an attractiveness rating task, followed by a speech production task where they were presented a subset of the same faces. By comparing the rating results and corresponding acoustic data of the facial stimuli, we found that when young Cantonese women spoke to an attractive male, they were less breathy, lower in fundamental frequency, and with denser formants, all of which are considered to project a larger body. Participants who were more satisfied with their own height used these vocal strategies more actively. These results are discussed in terms of the body size projection principle.}, } @article {pmid35062025, year = {2022}, author = {Suess, N and Hauswald, A and Reisinger, P and Rösch, S and Keitel, A and Weisz, N}, title = {Cortical tracking of formant modulations derived from silently presented lip movements and its decline with age.}, journal = {Cerebral cortex (New York, N.Y. : 1991)}, volume = {32}, number = {21}, pages = {4818-4833}, pmid = {35062025}, issn = {1460-2199}, support = {MR/W02912X/1/MRC_/Medical Research Council/United Kingdom ; P 31230/FWF_/Austrian Science Fund FWF/Austria ; P 34237/FWF_/Austrian Science Fund FWF/Austria ; }, mesh = {Humans ; *Speech Perception ; Acoustic Stimulation ; Lip ; Speech ; Movement ; }, abstract = {The integration of visual and auditory cues is crucial for successful processing of speech, especially under adverse conditions. Recent reports have shown that when participants watch muted videos of speakers, the phonological information about the acoustic speech envelope, which is associated with but independent from the speakers' lip movements, is tracked by the visual cortex. However, the speech signal also carries richer acoustic details, for example, about the fundamental frequency and the resonant frequencies, whose visuophonological transformation could aid speech processing. Here, we investigated the neural basis of the visuo-phonological transformation processes of these more fine-grained acoustic details and assessed how they change as a function of age. We recorded whole-head magnetoencephalographic (MEG) data while the participants watched silent normal (i.e., natural) and reversed videos of a speaker and paid attention to their lip movements. We found that the visual cortex is able to track the unheard natural modulations of resonant frequencies (or formants) and the pitch (or fundamental frequency) linked to lip movements. Importantly, only the processing of natural unheard formants decreases significantly with age in the visual and also in the cingulate cortex. This is not the case for the processing of the unheard speech envelope, the fundamental frequency, or the purely visual information carried by lip movements. These results show that unheard spectral fine details (along with the unheard acoustic envelope) are transformed from a mere visual to a phonological representation. 
Aging especially affects the ability to derive spectral dynamics at formant frequencies. As listening in noisy environments should capitalize on the ability to track spectral fine details, our results provide a novel focus on compensatory processes in such challenging situations.}, } @article {pmid35038295, year = {2022}, author = {Almaghrabi, SA and Thewlis, D and Thwaites, S and Rogasch, NC and Lau, S and Clark, SR and Baumert, M}, title = {The Reproducibility of Bio-Acoustic Features is Associated With Sample Duration, Speech Task, and Gender.}, journal = {IEEE transactions on neural systems and rehabilitation engineering : a publication of the IEEE Engineering in Medicine and Biology Society}, volume = {30}, number = {}, pages = {167-175}, doi = {10.1109/TNSRE.2022.3143117}, pmid = {35038295}, issn = {1558-0210}, mesh = {Acoustics ; Adult ; Female ; Humans ; Male ; Reproducibility of Results ; *Speech ; Speech Acoustics ; *Voice ; }, abstract = {Bio-acoustic properties of speech show evolving value in analyzing psychiatric illnesses. Obtaining a sufficient speech sample length to quantify these properties is essential, but the impact of sample duration on the stability of bio-acoustic features has not been systematically explored. We aimed to evaluate bio-acoustic features' reproducibility against changes in speech durations and tasks. We extracted source, spectral, formant, and prosodic features in 185 English-speaking adults (98 w, 87 m) for reading-a-story and counting tasks. We compared features at 25% of the total sample duration of the reading task to those obtained from non-overlapping randomly selected sub-samples shortened to 75%, 50%, and 25% of total duration using intraclass correlation coefficients. We also compared the features extracted from entire recordings to those measured at 25% of the duration and features obtained from 50% of the duration. Further, we compared features extracted from reading-a-story to counting tasks. Our results show that the number of reproducible features (out of 125) decreased stepwise with duration reduction. Spectral shape, pitch, and formants reached excellent reproducibility. Mel-frequency cepstral coefficients (MFCCs), loudness, and zero-crossing rate achieved excellent reproducibility only at a longer duration. Reproducibility of source, MFCC derivatives, and voicing probability (VP) was poor. Significant gender differences existed in jitter, MFCC first-derivative, spectral skewness, pitch, VP, and formants. Around 97% of features in both genders were not reproducible across speech tasks, in part due to the short counting task duration. In conclusion, bio-acoustic features are less reproducible in shorter samples and are affected by gender.}, } @article {pmid35005711, year = {2021}, author = {Gaines, JL and Kim, KS and Parrell, B and Ramanarayanan, V and Nagarajan, SS and Houde, JF}, title = {Discrete constriction locations describe a comprehensive range of vocal tract shapes in the Maeda model.}, journal = {JASA express letters}, volume = {1}, number = {12}, pages = {124402}, pmid = {35005711}, issn = {2691-1191}, support = {F32 DC019538/DC/NIDCD NIH HHS/United States ; R01 DC013979/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; }, abstract = {The Maeda model was used to generate a large set of vocoid-producing vocal tract configurations.
The resulting dataset (a) produced a comprehensive range of formant frequencies and (b) displayed discrete tongue body constriction locations (palatal, velar/uvular, and lower pharyngeal). The discrete parameterization of constriction location across the vowel space suggests this is likely a fundamental characteristic of the human vocal tract, and not limited to any specific set of vowel contrasts. These findings suggest that in addition to established articulatory-acoustic constraints, fundamental biomechanical constraints of the vocal tract may also explain such discreteness.}, } @article {pmid34987356, year = {2021}, author = {Cheng, FY and Xu, C and Gold, L and Smith, S}, title = {Rapid Enhancement of Subcortical Neural Responses to Sine-Wave Speech.}, journal = {Frontiers in neuroscience}, volume = {15}, number = {}, pages = {747303}, pmid = {34987356}, issn = {1662-4548}, support = {K01 DC017192/DC/NIDCD NIH HHS/United States ; }, abstract = {The efferent auditory nervous system may be a potent force in shaping how the brain responds to behaviorally significant sounds. Previous human experiments using the frequency following response (FFR) have shown efferent-induced modulation of subcortical auditory function online and over short- and long-term time scales; however, a contemporary understanding of FFR generation presents new questions about whether previous effects were constrained solely to the auditory subcortex. The present experiment used sine-wave speech (SWS), an acoustically-sparse stimulus in which dynamic pure tones represent speech formant contours, to evoke FFRSWS. Due to the higher stimulus frequencies used in SWS, this approach biased neural responses toward brainstem generators and allowed for three stimuli (/bɔ/, /bu/, and /bo/) to be used to evoke FFRSWS before and after listeners in a training group were made aware that they were hearing a degraded speech stimulus. All SWS stimuli were rapidly perceived as speech when presented with a SWS carrier phrase, and average token identification reached ceiling performance during a perceptual training phase. Compared to a control group which remained naïve throughout the experiment, training group FFRSWS amplitudes were enhanced post-training for each stimulus. Further, linear support vector machine classification of training group FFRSWS significantly improved post-training compared to the control group, indicating that training-induced neural enhancements were sufficient to bolster machine learning classification accuracy. These results suggest that the efferent auditory system may rapidly modulate auditory brainstem representation of sounds depending on their context and perception as non-speech or speech.}, } @article {pmid34975607, year = {2021}, author = {Meykadeh, A and Golfam, A and Nasrabadi, AM and Ameri, H and Sommer, W}, title = {First Event-Related Potentials Evidence of Auditory Morphosyntactic Processing in a Subject-Object-Verb Nominative-Accusative Language (Farsi).}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {698165}, pmid = {34975607}, issn = {1664-1078}, abstract = {While most studies on neural signals of online language processing have focused on a few (usually Western) subject-verb-object (SVO) languages, corresponding knowledge on subject-object-verb (SOV) languages is scarce. Here we studied Farsi, a language with canonical SOV word order.
Because we were interested in the consequences of second-language acquisition, we compared monolingual native Farsi speakers and equally proficient bilinguals who had learned Farsi only after entering primary school. We analyzed event-related potentials (ERPs) to correct and morphosyntactically incorrect sentence-final syllables in a sentence correctness judgment task. Incorrect syllables elicited a late posterior positivity at 500-700 ms after the final syllable, resembling the P600 component, as previously observed for syntactic violations at sentence-middle positions in SVO languages. There was no sign of a left anterior negativity (LAN) preceding the P600. Additionally, we provide evidence for a real-time discrimination of phonological categories associated with morphosyntactic manipulations (between 35 and 135 ms), manifesting the instantaneous neural response to unexpected perturbations. The L2 Farsi speakers were indistinguishable from L1 speakers in terms of performance and neural signals of syntactic violations, indicating that exposure to a second language at school entry may result in native-like performance and neural correlates. In nonnative (but not native) speakers, verbal working memory capacity correlated with the late posterior positivity and performance accuracy. Hence, this first ERP study of morphosyntactic violations in a spoken SOV nominative-accusative language demonstrates ERP effects in response to morphosyntactic violations and the involvement of executive functions in non-native speakers in computations of subject-verb agreement.}, } @article {pmid34966297, year = {2021}, author = {Yamada, Y and Shinkawa, K and Nemoto, M and Arai, T}, title = {Automatic Assessment of Loneliness in Older Adults Using Speech Analysis on Responses to Daily Life Questions.}, journal = {Frontiers in psychiatry}, volume = {12}, number = {}, pages = {712251}, pmid = {34966297}, issn = {1664-0640}, abstract = {Loneliness is a perceived state of social and emotional isolation that has been associated with a wide range of adverse health effects in older adults. Automatically assessing loneliness by passively monitoring daily behaviors could potentially contribute to early detection and intervention for mitigating loneliness. Speech data has been successfully used for inferring changes in emotional states and mental health conditions, but its association with loneliness in older adults remains unexplored. In this study, we developed a tablet-based application and collected speech responses of 57 older adults to daily life questions regarding, for example, one's feelings and future travel plans. From audio data of these speech responses, we automatically extracted speech features characterizing acoustic, prosodic, and linguistic aspects, and investigated their associations with self-rated scores of the UCLA Loneliness Scale. Consequently, we found that with increasing loneliness scores, speech responses tended to have fewer inflections, longer pauses, reduced second formant frequencies, reduced variances of the speech spectrum, more filler words, and fewer positive words. The cross-validation results showed that regression and binary-classification models using speech features could estimate loneliness scores with an R² of 0.57 and detect individuals with high loneliness scores with 95.6% accuracy, respectively.
Our study provides the first empirical results suggesting the possibility of using speech data that can be collected in everyday life for the automatic assessment of loneliness in older adults, which could help develop monitoring technologies for early detection and intervention for mitigating loneliness.}, } @article {pmid34963204, year = {2021}, author = {Hussain, Q and Kochetov, A}, title = {Acoustic classification of coronal stops of Eastern Punjabi.}, journal = {Phonetica}, volume = {79}, number = {1}, pages = {77-110}, doi = {10.1515/phon-2021-2015}, pmid = {34963204}, issn = {1423-0321}, mesh = {Acoustics ; Humans ; *Language ; Phonetics ; *Speech Acoustics ; Voice Quality ; }, abstract = {Punjabi is an Indo-Aryan language which contrasts a rich set of coronal stops at dental and retroflex places of articulation across three laryngeal configurations. Moreover, all these stops occur contrastively in various positions (word-initially, -medially, and -finally). The goal of this study is to investigate how various coronal place and laryngeal contrasts are distinguished acoustically both within and across word positions. A number of temporal and spectral correlates were examined in data from 13 speakers of Eastern Punjabi: Voice Onset Time, release and closure durations, fundamental frequency, F1-F3 formants, spectral center of gravity and standard deviation, H1*-H2*, and cepstral peak prominence. The findings indicated that higher formants and spectral measures were most important for the classification of place contrasts across word positions, whereas laryngeal contrasts were reliably distinguished by durational and voice quality measures. Word-medially and -finally, F2 and F3 of the preceding vowels played a key role in distinguishing the dental and retroflex stops, while spectral noise measures were more important word-initially. The findings of this study contribute to a better understanding of factors involved in the maintenance of typologically rare and phonetically complex sets of place and laryngeal contrasts in the coronal stops of Indo-Aryan languages.}, } @article {pmid34924928, year = {2021}, author = {Zheng, Z and Li, K and Feng, G and Guo, Y and Li, Y and Xiao, L and Liu, C and He, S and Zhang, Z and Qian, D and Feng, Y}, title = {Relative Weights of Temporal Envelope Cues in Different Frequency Regions for Mandarin Vowel, Consonant, and Lexical Tone Recognition.}, journal = {Frontiers in neuroscience}, volume = {15}, number = {}, pages = {744959}, pmid = {34924928}, issn = {1662-4548}, abstract = {Objectives: Mandarin-speaking users of cochlear implants (CI) perform more poorly than their English-speaking counterparts. This may be because present CI speech coding schemes are largely based on English. This study aims to evaluate the relative contributions of temporal envelope (E) cues to Mandarin phoneme (including vowel and consonant) and lexical tone recognition to provide information for speech coding schemes specific to Mandarin. Design: Eleven normal hearing subjects were studied using acoustic temporal E cues that were extracted from 30 continuous frequency bands between 80 and 7,562 Hz using the Hilbert transform and divided into five frequency regions. Percent-correct recognition scores were obtained with acoustic E cues presented in three, four, and five frequency regions and their relative weights calculated using the least-square approach.
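[Editor's note: the temporal envelope (E) extraction named in the Design above is conventionally a band-pass filter followed by the Hilbert transform; the minimal Python sketch below shows the operation for one assumed band (Region 1, 80-502 Hz) rather than the full 30-band processing chain.]

```python
import numpy as np
from scipy.signal import butter, sosfiltfilt, hilbert

fs = 16000                                   # assumed audio sampling rate (Hz)
rng = np.random.default_rng(4)
speech = rng.normal(size=fs // 2)            # 500 ms noise stand-in for speech

# Band-limit to one frequency region (Region 1 of the abstract: 80-502 Hz)
sos = butter(4, [80, 502], btype="bandpass", fs=fs, output="sos")
band = sosfiltfilt(sos, speech)

# Temporal envelope (E cue): magnitude of the analytic signal
envelope = np.abs(hilbert(band))
print(f"envelope mean: {envelope.mean():.4f}")
```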
Results: For stimuli with three, four, and five frequency regions, percent-correct scores for vowel recognition using E cues were 50.43-84.82%, 76.27-95.24%, and 96.58%, respectively; for consonant recognition 35.49-63.77%, 67.75-78.87%, and 87.87%; for lexical tone recognition 60.80-97.15%, 73.16-96.87%, and 96.73%. For frequency region 1 to frequency region 5, the mean weights in vowel recognition were 0.17, 0.31, 0.22, 0.18, and 0.12, respectively; in consonant recognition 0.10, 0.16, 0.18, 0.23, and 0.33; in lexical tone recognition 0.38, 0.18, 0.14, 0.16, and 0.14. Conclusion: The region that contributed most to vowel recognition was Region 2 (502-1,022 Hz), which contains first formant (F1) information; Region 5 (3,856-7,562 Hz) contributed most to consonant recognition; and Region 1 (80-502 Hz), which contains fundamental frequency (F0) information, contributed most to lexical tone recognition.}, } @article {pmid34889651, year = {2022}, author = {Polka, L and Masapollo, M and Ménard, L}, title = {Setting the Stage for Speech Production: Infants Prefer Listening to Speech Sounds With Infant Vocal Resonances.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {1}, pages = {109-120}, doi = {10.1044/2021_JSLHR-21-00412}, pmid = {34889651}, issn = {1558-9102}, mesh = {Adult ; Auditory Perception ; Humans ; Infant ; Phonetics ; Speech ; *Speech Perception ; *Voice ; }, abstract = {PURPOSE: Current models of speech development argue for an early link between speech production and perception in infants. Recent data show that young infants (at 4-6 months) preferentially attend to speech sounds (vowels) with infant vocal properties compared to those with adult vocal properties, suggesting the presence of special "memory banks" for one's own nascent speech-like productions. This study investigated whether the vocal resonances (formants) of the infant vocal tract are sufficient to elicit this preference and whether this perceptual bias changes with age and emerging vocal production skills.

METHOD: We selectively manipulated the fundamental frequency (f0) of vowels synthesized with formants specifying either an infant or adult vocal tract, and then tested the effects of those manipulations on the listening preferences of infants who were slightly older than those previously tested (at 6-8 months).
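Selective f0 shifting of this kind is typically done with PSOLA-style resynthesis. A hedged sketch using Praat's manipulation pipeline through the parselmouth library; the input file and the 1.5x factor are illustrative stand-ins, not the study's stimulus parameters:

```python
# Hedged sketch of selective f0 manipulation via Praat's manipulation and
# overlap-add resynthesis, called through parselmouth. "vowel.wav" is a
# hypothetical input token.
import parselmouth
from parselmouth.praat import call

snd = parselmouth.Sound("vowel.wav")
manip = call(snd, "To Manipulation...", 0.01, 75, 600)  # time step, pitch floor, pitch ceiling (Hz)
tier = call(manip, "Extract pitch tier")
call(tier, "Multiply frequencies...", snd.xmin, snd.xmax, 1.5)  # raise f0 by 50%
call([tier, manip], "Replace pitch tier")
shifted = call(manip, "Get resynthesis (overlap-add)")
shifted.save("vowel_f0_up.wav", "WAV")
```

Because only the pitch tier is altered, the formant pattern (and hence the specified vocal tract) is left essentially intact, which is the point of such a manipulation.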

RESULTS: Unlike findings with younger infants (at 4-6 months), slightly older infants in Experiment 1 displayed a robust preference for vowels with infant formants over adult formants when f0 was matched. The strength of this preference was also positively correlated with age among infants between 4 and 8 months. In Experiment 2, this preference favoring infant over adult formants was maintained when f0 values were modulated.

CONCLUSIONS: Infants between 6 and 8 months of age displayed a robust and distinct preference for speech with resonances specifying a vocal tract that is similar in size and length to their own. This finding, together with data indicating that this preference is not present in younger infants and appears to increase with age, suggests that nascent knowledge of the motor schema of the vocal tract may play a role in shaping this perceptual bias, lending support to current models of speech development.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.17131805.}, } @article {pmid34860148, year = {2023}, author = {Sundberg, J and Lindblom, B and Hefele, AM}, title = {Voice source, formant frequencies and vocal tract shape in overtone singing. A case study.}, journal = {Logopedics, phoniatrics, vocology}, volume = {48}, number = {2}, pages = {75-87}, doi = {10.1080/14015439.2021.1998607}, pmid = {34860148}, issn = {1651-2022}, mesh = {Humans ; *Voice ; Phonation ; *Singing ; Voice Quality ; Tongue ; }, abstract = {Purpose: In overtone singing a singer produces two pitches simultaneously, a low-pitched, continuous drone plus a melody played on the higher, flutelike and strongly enhanced overtones of the drone. The purpose of this study was to analyse underlying acoustical, phonatory and articulatory phenomena. Methods: The voice source was analyzed by inverse filtering the sound, the articulation from a dynamic MRI video of the vocal tract profile, and the lip opening from a frontal-view video recording. Vocal tract cross-distances were measured in the MR recording and converted to area functions, the formant frequencies of which were computed. Results: Inverse filtering revealed that the overtone enhancement resulted from a close clustering of formants 2 and 3. The MRI material showed that for low enhanced overtone frequencies (FE) the tongue tip was raised and strongly retracted, while for high FE the tongue tip was less retracted but forming a longer constriction. Thus, the tongue configuration changed from an apical/anterior to a dorsal/posterior articulation. The formant frequencies derived from the area functions matched almost perfectly those used for the inverse filtering. Further, analyses of the area functions revealed that the second formant frequency was strongly dependent on the back cavity, and the third on the front cavity, which acted like a Helmholtz resonator, tuned by the tongue tip position and lip opening. Conclusions: This type of overtone singing can be fully explained by the well-established source-filter theory of voice production, as recently found by Bergevin et al. [1] for another type of overtone singing.}, } @article {pmid34852626, year = {2021}, author = {Roberts, B and Summers, RJ and Bailey, PJ}, title = {Mandatory dichotic integration of second-formant information: Contralateral sine bleats have predictable effects on consonant place judgments.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {5}, pages = {3693}, doi = {10.1121/10.0007132}, pmid = {34852626}, issn = {1520-8524}, mesh = {Acoustic Stimulation ; Judgment ; Phonetics ; Speech Acoustics ; *Speech Intelligibility ; *Speech Perception ; }, abstract = {Speech-on-speech informational masking arises because the interferer disrupts target processing (e.g., capacity limitations) or corrupts it (e.g., intrusions into the target percept); the latter should produce predictable errors. Listeners identified the consonant in monaural buzz-excited three-formant analogues of approximant-vowel syllables, forming a place of articulation series (/w/-/l/-/j/). There were two 11-member series; the vowel was either high-front or low-back. Series members shared formant-amplitude contours, fundamental frequency, and F1+F3 frequency contours; they were distinguished solely by the F2 frequency contour before the steady portion. Targets were always presented in the left ear.
For each series, F2 frequency and amplitude contours were also used to generate interferers with altered source properties-sine-wave analogues of F2 (sine bleats) matched to their buzz-excited counterparts. Accompanying each series member with a fixed mismatched sine bleat in the contralateral ear produced systematic and predictable effects on category judgments; these effects were usually largest for bleats involving the fastest rate or greatest extent of frequency change. Judgments of isolated sine bleats using the three place labels were often unsystematic or arbitrary. These results indicate that informational masking by interferers involved corruption of target processing as a result of mandatory dichotic integration of F2 information, despite the grouping cues disfavoring this integration.}, } @article {pmid34852620, year = {2021}, author = {Lodermeyer, A and Bagheri, E and Kniesburges, S and Näger, C and Probst, J and Döllinger, M and Becker, S}, title = {The mechanisms of harmonic sound generation during phonation: A multi-modal measurement-based approach.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {5}, pages = {3485}, doi = {10.1121/10.0006974}, pmid = {34852620}, issn = {1520-8524}, mesh = {Glottis/diagnostic imaging ; Humans ; *Larynx ; *Phonation ; Sound ; Vocal Cords/diagnostic imaging ; }, abstract = {Sound generation during voiced speech remains an open research topic because the underlying process within the human larynx is hardly accessible for direct measurements. In the present study, harmonic sound generation during phonation was investigated with a model that replicates the fully coupled fluid-structure-acoustic interaction (FSAI). The FSAI was captured using a multi-modal approach by measuring the flow and acoustic source fields based on particle image velocimetry, as well as the surface velocity of the vocal folds based on laser vibrometry and high-speed imaging. Strong harmonic sources were localized near the glottis, as well as further downstream, during the presence of the supraglottal jet. The strongest harmonic content of the vocal fold surface motion was verified for the area near the glottis, which directly interacts with the glottal jet flow. Also, the acoustic back-coupling of the formant frequencies onto the harmonic oscillation of the vocal folds was verified. These findings verify that harmonic sound generation is the result of a strong interrelation between the vocal fold motion, modulated flow field, and vocal tract geometry.}, } @article {pmid34852594, year = {2021}, author = {Barreda, S and Assmann, PF}, title = {Perception of gender in children's voices.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {5}, pages = {3949}, doi = {10.1121/10.0006785}, pmid = {34852594}, issn = {1520-8524}, mesh = {Acoustics ; Adult ; Child ; Cues ; Female ; Humans ; Male ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {To investigate the perception of gender from children's voices, adult listeners were presented with /hVd/ syllables, in isolation and in sentence context, produced by children between 5 and 18 years. Half the listeners were informed of the age of the talker during trials, while the other half were not. 
Correct gender identifications increased with talker age; however, performance was above chance even for age groups where the cues most often associated with gender differentiation (i.e., average fundamental frequency and formant frequencies) were not consistently different between boys and girls. The results of acoustic models suggest that cues were used in an age-dependent manner, whether listeners were explicitly told the age of the talker or not. Overall, results are consistent with the hypothesis that talker age and gender are estimated jointly in the process of speech perception. Furthermore, results show that the gender of individual talkers can be identified accurately well before reliable anatomical differences arise in the vocal tracts of females and males. In general, results support the notion that the transmission of gender information from voice depends substantially on gender-dependent patterns of articulation, rather than following deterministically from anatomical differences between male and female talkers.}, } @article {pmid34847585, year = {2021}, author = {Wilson, RH and Scherer, NJ}, title = {Waveform Amplitude and Temporal Symmetric/Asymmetric Characteristics of Phoneme and Syllable Segments in the W-1 Spondaic Words Recorded by Four Speakers.}, journal = {Journal of the American Academy of Audiology}, volume = {32}, number = {7}, pages = {445-463}, doi = {10.1055/s-0041-1730959}, pmid = {34847585}, issn = {2157-3107}, mesh = {Data Collection ; Female ; Humans ; Male ; *Phonetics ; *Speech ; }, abstract = {BACKGROUND: The amplitude and temporal asymmetry of the speech waveform are mostly associated with voiced speech utterances and are obvious in recent graphic depictions in the literature. The asymmetries are attributed to the presence and interactions of the major formants characteristic of voicing with possible contributions from the unidirectional air flow that accompanies speaking.

PURPOSE: This study investigated the amplitude symmetry/asymmetry characteristics (polarity) of speech waveforms, which to our knowledge have not been quantified.

STUDY SAMPLE: Thirty-six spondaic words spoken by two male and two female speakers were selected because they were multisyllabic words providing a reasonable sampling of speech sounds, and because four recordings unrelated to the topic under study were available.

RESEARCH DESIGN: Collectively, the words were segmented into phonemes (vowels [130], diphthongs [77], voiced consonants [258], voiceless consonants [219]), syllables (82), and blends (6). For each segment the following were analyzed separately for the positive and negative datum points: peak amplitude, the percent of the total segment datum points, the root-mean-square (rms) amplitude, and the crest factor.

DATA COLLECTION AND ANALYSES: The digitized words (44,100 samples/s; 16-bit) were parsed into 144 files (36 words × 4 speakers), edited, transcribed to numeric values (±1), and stored in a spreadsheet in which all analyses were performed with in-house routines. Overall, approximately 85% of each waveform was analyzed, which excluded portions of silent intervals, transitions, and diminished waveform endings.
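The per-segment polarity measures described above reduce to a few array operations. A small sketch, with a synthetic segment standing in for the parsed word files:

```python
# Sketch of the per-segment polarity measures: split the positive and
# negative datum points, then compute their share of samples, peak and rms
# amplitudes, and crest factors. A toy segment stands in for the word files.
import numpy as np

def polarity_stats(seg):
    stats = {}
    for name, part in (("pos", seg[seg > 0]), ("neg", seg[seg < 0])):
        rms = np.sqrt(np.mean(part ** 2))
        peak = np.max(np.abs(part))
        stats[name] = {
            "percent_points": 100 * part.size / seg.size,
            "peak_db": 20 * np.log10(peak),
            "rms_db": 20 * np.log10(rms),
            "crest_factor_db": 20 * np.log10(peak / rms),
        }
    return stats

rng = np.random.default_rng(0)
segment = rng.normal(0, 0.1, 44100)   # stand-in for one vowel segment on the ±1 scale
print(polarity_stats(segment))
```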

RESULTS: The vowel, diphthong, and syllable segments had durations (180-220 ms) that were about twice as long as the consonant durations (∼90 ms) and peak and rms amplitudes that were 6 to 12 dB higher than the consonant peak and rms amplitudes. Vowel, diphthong, and syllable segments had 10% more positive datum points (55%) than negative points (45%), which suggested temporal asymmetries within the segments. With voiced consonants, the distribution of positive and negative datum points dropped to 52 and 48% and essentially was equal with the voiceless consonants (50.3 and 49.6%). The mean rms amplitudes of the negative datum points were higher than the rms amplitudes for the positive points by 2 dB (vowels, diphthongs, and syllables), 1 dB (voiced consonants), and 0.1 dB (voiceless consonants). The 144 waveforms and segmentations are illustrated in the Supplementary Material along with the tabularized positive and negative segment characteristics.

CONCLUSIONS: The temporal and amplitude waveform asymmetries were by far most notable in segments that had a voicing component, which included the voiced consonants. These asymmetries were characterized by larger envelopes and more energy in the negative side of the waveform segment than in the positive side. Interestingly, these segments had more positive datum points than negative points, which indicated temporal asymmetry. All aspects of the voiceless consonants were equally divided between the positive and negative domains. There were female/male differences, but with these limited samples such differences should not be generalized beyond the speakers in this study. The influence of the temporal and amplitude asymmetries on monaural word-recognition performance is thought to be negligible.}, } @article {pmid34827803, year = {2021}, author = {Hedwig, D and Poole, J and Granli, P}, title = {Does Social Complexity Drive Vocal Complexity? Insights from the Two African Elephant Species.}, journal = {Animals : an open access journal from MDPI}, volume = {11}, number = {11}, pages = {}, pmid = {34827803}, issn = {2076-2615}, abstract = {The social complexity hypothesis (SCH) for communication states that the range and frequency of social interactions drive the evolution of complex communication systems. Surprisingly, few studies have empirically tested the SCH for vocal communication systems. Filling this gap is important because a co-evolutionary runaway process between social and vocal complexity may have shaped the most intricate communication system, human language. We here propose the African elephant Loxodonta spec. as an excellent study system to investigate the relationships between social and vocal complexity. We review how the distinct differences in social complexity between the two species of African elephants, the forest elephant L. cyclotis and the savanna elephant L. africana, relate to repertoire size and structure, as well as complex communication skills in the two species, such as call combination or intentional formant modulation including the trunk. Our findings suggest that Loxodonta may contradict the SCH, as well as other factors put forth to explain patterns of vocal complexity across species. We propose that life history traits, a factor that has gained little attention as a driver of vocal complexity, and the extensive parental care associated with a uniquely low and slow reproductive rate, may have led to the emergence of pronounced vocal complexity in the forest elephant despite their less complex social system compared to the savanna elephant. Conclusions must be drawn cautiously, however.
A better understanding of vocal complexity in the genus Loxodonta will depend on continuing advancements in remote data collection technologies to overcome the challenges of observing forest elephants in their dense rainforest habitat, as well as the availability of directly comparable data and methods, quantifying both structural and contextual variability in the production of rumbles and other vocalizations in both species of African elephants.}, } @article {pmid34809062, year = {2021}, author = {Du, X and Zhang, X and Wang, Y and Ma, G and Liu, Y and Wang, B and Mao, H}, title = {Highly sensitive detection of plant growth regulators by using terahertz time-domain spectroscopy combined with metamaterials.}, journal = {Optics express}, volume = {29}, number = {22}, pages = {36535-36545}, doi = {10.1364/OE.437909}, pmid = {34809062}, issn = {1094-4087}, mesh = {Biosensing Techniques/*methods ; Computer Simulation ; Equipment Design ; Glycylglycine/*analysis ; Hydrazines/*analysis ; Plant Growth Regulators/*analysis ; Plants/*chemistry ; Refractometry ; Sensitivity and Specificity ; Terahertz Spectroscopy/instrumentation/*methods ; }, abstract = {The rapid and sensitive detection of plant-growth-regulator (PGR) residue is essential for ensuring food safety for consumers. However, there are many disadvantages in current approaches to detecting PGR residue. In this paper, we demonstrate a highly sensitive PGR detection method by using terahertz time-domain spectroscopy combined with metamaterials. We propose a double-formant metamaterial resonator: a split-ring structure composed of a titanium-gold nanostructure on a polyimide-film substrate. The terahertz spectral response and electric field distribution of the metamaterials under different analyte thicknesses and refractive indices were also investigated. The simulation results showed that the theoretical sensitivities of resonance peak 1 and peak 2 of the refractive index sensor based on our designed metamaterial resonator approach 780 and 720 gigahertz per refractive index unit (GHz/RIU), respectively. In experiments, a rapid solution analysis platform based on the double-formant metamaterial resonator was set up, and PGR residues in aqueous solution were directly and rapidly detected through terahertz time-domain spectroscopy. The results showed that metamaterials can successfully detect butylhydrazine and N-N diglycine at a concentration as low as 0.05 mg/L. This study paves a new way for sensitive, rapid, low-cost detection of PGRs. It also means that the double-formant metamaterial resonator has significant potential for other applications in terahertz sensing.}, } @article {pmid34808474, year = {2022}, author = {Li, P and Ross, CF and Luo, ZX}, title = {Morphological disparity and evolutionary transformations in the primate hyoid apparatus.}, journal = {Journal of human evolution}, volume = {162}, number = {}, pages = {103094}, doi = {10.1016/j.jhevol.2021.103094}, pmid = {34808474}, issn = {1095-8606}, mesh = {Animals ; Female ; Haplorhini ; Hyoid Bone/anatomy & histology ; Phylogeny ; *Placenta ; Pregnancy ; *Primates/anatomy & histology ; }, abstract = {The hyoid apparatus plays an integral role in swallowing, respiration, and vocalization in mammals.
Most placental mammals have a rod-shaped basihyal connected to the basicranium via both soft tissues and a mobile bony chain (the anterior cornu), whereas anthropoid primates have broad, shield-like or even cup-shaped basihyals suspended from the basicranium by soft tissues only. How the unique anthropoid hyoid morphology evolved is unknown, and hyoid morphology of nonanthropoid primates is poorly documented. Here we use phylogenetic comparative methods and linear morphometrics to address knowledge gaps in hyoid evolution among primates and their euarchontan outgroups. We find that dermopterans have variable reduction of cornu elements. Cynocephalus volans is sexually dimorphic in hyoid morphology. Tupaia and all lemuroids except Daubentonia have a fully ossified anterior cornu connecting a rod-shaped basihyal to the basicranium; this is the ancestral mammalian pattern that is also characteristic of the last common ancestor of Primates. Haplorhines exhibit a reduced anterior cornu, and anthropoids underwent further increase in basihyal aspect ratio values and in relative basihyal volume. Convergent with haplorhines, lorisoid strepsirrhines independently evolved a broad basihyal and reduced anterior cornua. While a reduced anterior cornu is hypothesized to facilitate vocal tract lengthening and lower formant frequencies in some mammals, our results suggest vocalization adaptations alone are unlikely to drive the iterative reduction of anterior cornua within Primates. Our new data on euarchontan hyoid evolution provide an anatomical basis for further exploring the form-function relationships of the hyoid across different behaviors, including vocalization, chewing, and swallowing.}, } @article {pmid34799495, year = {2022}, author = {Xu, L and Luo, J and Xie, D and Chao, X and Wang, R and Zahorik, P and Luo, X}, title = {Reverberation Degrades Pitch Perception but Not Mandarin Tone and Vowel Recognition of Cochlear Implant Users.}, journal = {Ear and hearing}, volume = {43}, number = {4}, pages = {1139-1150}, doi = {10.1097/AUD.0000000000001173}, pmid = {34799495}, issn = {1538-4667}, mesh = {*Cochlear Implantation ; *Cochlear Implants ; *Deafness/rehabilitation ; Humans ; Pitch Perception/physiology ; *Speech Perception/physiology ; }, abstract = {OBJECTIVES: The primary goal of this study was to investigate the effects of reverberation on Mandarin tone and vowel recognition of cochlear implant (CI) users and normal-hearing (NH) listeners. To understand the performance of Mandarin tone recognition, this study also measured participants' pitch perception and the availability of temporal envelope cues in reverberation.

DESIGN: Fifteen CI users and nine NH listeners, all Mandarin speakers, were asked to recognize Mandarin single-vowels produced in four lexical tones and rank harmonic complex tones in pitch with different reverberation times (RTs) from 0 to 1 second. Virtual acoustic techniques were used to simulate rooms with different degrees of reverberation. Vowel duration and correlation between amplitude envelope and fundamental frequency (F0) contour were analyzed for different tones as a function of the RT.
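For prototyping, reverberation at a target RT can be approximated by convolving dry speech with an exponentially decaying noise impulse response. The sketch below is a generic stand-in for the study's virtual acoustic techniques, which modeled rooms in more detail:

```python
# Minimal sketch of simulating a given RT60 by convolving a dry signal with
# an exponentially decaying noise impulse response. Amplitude decays by
# 60 dB over rt60 seconds: exp(-6.91 * t / rt60), since 6.91 ~= ln(10^3).
import numpy as np
from scipy.signal import fftconvolve

def reverberate(x, fs, rt60):
    n = int(fs * rt60)
    t = np.arange(n) / fs
    rir = np.random.default_rng(1).standard_normal(n) * np.exp(-6.91 * t / rt60)
    rir /= np.max(np.abs(rir))
    y = fftconvolve(x, rir)[: len(x)]
    return y / np.max(np.abs(y))

fs = 16000
dry = np.random.default_rng(0).standard_normal(fs)  # stand-in for a vowel token
wet = reverberate(dry, fs, rt60=1.0)                # RT at the top of the studied range
```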

RESULTS: Vowel durations of different tones significantly increased with longer RTs. Amplitude-F0 correlation remained similar for the falling Tone 4 but greatly decreased for the other tones in reverberation. NH listeners had robust pitch-ranking, tone recognition, and vowel recognition performance as the RT increased. Reverberation significantly degraded CI users' pitch-ranking thresholds but did not significantly affect the overall scores of tone and vowel recognition with CIs. Detailed analyses of tone confusion matrices showed that CI users reduced the flat Tone-1 responses but increased the falling Tone-4 responses in reverberation, possibly due to the falling amplitude envelope of late reflections after the original vowel segment. CI users' tone recognition scores were not correlated with their pitch-ranking thresholds.

CONCLUSIONS: NH listeners can reliably recognize Mandarin tones in reverberation using salient pitch cues from spectral and temporal fine structures. However, CI users have poorer pitch perception using F0-related amplitude modulations that are reduced in reverberation. Reverberation distorts speech amplitude envelopes, which affect the distribution of tone responses but not the accuracy of tone recognition with CIs. Recognition of vowels with stationary formant trajectories is not affected by reverberation for both NH listeners and CI users, regardless of the available spectral resolution. Future studies should test how the relatively stable vowel and tone recognition may contribute to sentence recognition in reverberation for Mandarin-speaking CI users.}, } @article {pmid34783468, year = {2021}, author = {Kovalenko, AN and Kastyro, IV and Popadyuk, VI and Vostrikov, AM and Sheveleva, VA and Kleyman, VK and Shalamov, KP and Torshin, VI}, title = {[Dynamics of vowel acoustic space indicators in patients with long-term hearing loss].}, journal = {Vestnik otorinolaringologii}, volume = {86}, number = {5}, pages = {17-21}, doi = {10.17116/otorino20218605117}, pmid = {34783468}, issn = {0042-4668}, mesh = {Acoustics ; Adult ; *Deafness ; Female ; *Hearing Loss/diagnosis ; Humans ; Male ; Phonetics ; Russia ; Speech Acoustics ; }, abstract = {UNLABELLED: A new procedure for transforming the vowel acoustic space (VAS; vowel acoustic triangles) was developed to characterize vowel production in individuals with long-term hearing loss (HL).

OBJECTIVE: To characterize the VAS of adult Russian speakers with long-term HL using newly developed acoustic indicators.

MATERIAL AND METHODS: Recordings of the sustained Russian cardinal vowels /a/, /i/, /u/ produced by 10 women and 10 men with long-term HL were acoustically analyzed. For each participant, the first two formants of each vowel were measured and log-transformed (logF1, logF2). Each VAS was transformed into a right triangle, its /u/ corner was moved to the origin, and its legs were aligned with the axes. In the control group, which consisted of subjects without hearing impairment, the VAS was almost symmetrical, similar across subjects, and of maximum size, whereas in the long-term HL group the VAS tended to be reduced in size and stretched along one axis.
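The geometric part of this normalization (log-transform, translate the /u/ corner to the origin, align one side with an axis) can be sketched directly; the authors' exact right-triangle construction is not reproduced, so this is only an approximation with made-up formant values:

```python
# Loose sketch of a VAS normalization: log-transform the corner-vowel
# formants, translate the /u/ corner to the origin, and rotate so the
# /u/-/i/ side lies along the logF2 axis. Values are hypothetical.
import numpy as np

corners = {"a": (850, 1300), "i": (300, 2300), "u": (320, 700)}  # (F1, F2) in Hz
pts = {v: np.log10(f) for v, f in corners.items()}               # (logF1, logF2)

origin = np.array(pts["u"])
shifted = {v: np.array(p) - origin for v, p in pts.items()}      # /u/ corner at origin

theta = np.arctan2(shifted["i"][0], shifted["i"][1])             # angle of /u/-/i/ side from logF2 axis
rot = np.array([[np.cos(theta), -np.sin(theta)],
                [np.sin(theta),  np.cos(theta)]])
aligned = {v: rot @ p for v, p in shifted.items()}
print({v: p.round(3) for v, p in aligned.items()})               # /i/ now lies on the logF2 axis
```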

RESULTS: Our study showed that a new VAS normalization approach can distinguish at least three groups of people with long-term HL.

CONCLUSION: There are those with vowel triangles stretched along the logF1 axis, those with vowel triangles stretched along the logF2 axis, and those with symmetrical vowel triangles. Causes of the VAS differences require further investigation.}, } @article {pmid34776842, year = {2021}, author = {Melchor, J and Vergara, J and Figueroa, T and Morán, I and Lemus, L}, title = {Formant-Based Recognition of Words and Other Naturalistic Sounds in Rhesus Monkeys.}, journal = {Frontiers in neuroscience}, volume = {15}, number = {}, pages = {728686}, pmid = {34776842}, issn = {1662-4548}, abstract = {In social animals, identifying sounds is critical for communication. In humans, the acoustic parameters involved in speech recognition, such as the formant frequencies derived from the resonance of the supralaryngeal vocal tract, have been well documented. However, how formants contribute to recognizing learned sounds in non-human primates remains unclear. To determine this, we trained two rhesus monkeys to discriminate target and non-target sounds presented in sequences of 1-3 sounds. After training, we performed three experiments: (1) We tested the monkeys' accuracy and reaction times during the discrimination of various acoustic categories; (2) their ability to discriminate morphing sounds; and (3) their ability to identify sounds consisting of formant 1 (F1), formant 2 (F2), or F1 and F2 (F1F2) pass filters. Our results indicate that macaques can learn diverse sounds and discriminate morphs and formants F1 and F2, suggesting that information from a few acoustic parameters suffices for recognizing complex sounds. We anticipate that future neurophysiological experiments in this paradigm may help elucidate how formants contribute to the recognition of sounds.}, } @article {pmid34775826, year = {2022}, author = {Cartei, V and Reby, D and Garnham, A and Oakhill, J and Banerjee, R}, title = {Peer audience effects on children's vocal masculinity and femininity.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {377}, number = {1841}, pages = {20200397}, pmid = {34775826}, issn = {1471-2970}, mesh = {Acoustics ; Child ; Female ; *Femininity ; Humans ; Male ; Masculinity ; *Voice ; }, abstract = {Existing evidence suggests that children from around the age of 8 years strategically alter their public image in accordance with known values and preferences of peers, through the self-descriptive information they convey. However, an important but neglected aspect of this 'self-presentation' is the medium through which such information is communicated: the voice itself. The present study explored peer audience effects on children's vocal productions. Fifty-six children (26 females, aged 8-10 years) were presented with vignettes where a fictional child, matched to the participant's age and sex, is trying to make friends with a group of same-sex peers with stereotypically masculine or feminine interests (rugby and ballet, respectively). Participants were asked to impersonate the child in that situation and, as the child, to read out loud masculine, feminine and gender-neutral self-descriptive statements to these hypothetical audiences. They also had to decide which of those self-descriptive statements would be most helpful for making friends. In line with previous research, boys and girls preferentially selected masculine or feminine self-descriptive statements depending on the audience interests.
Crucially, acoustic analyses of fundamental frequency and formant frequency spacing revealed that children also spontaneously altered their vocal productions: they feminized their voices when speaking to members of the ballet club, while they masculinized their voices when speaking to members of the rugby club. Both sexes also feminized their voices when uttering feminine sentences, compared to when uttering masculine and gender-neutral sentences. Implications for the hitherto neglected role of acoustic qualities of children's vocal behaviour in peer interactions are discussed. This article is part of the theme issue 'Voice modulation: from origin and mechanism to social impact (Part II)'.}, } @article {pmid34775821, year = {2022}, author = {Pisanski, K and Anikin, A and Reby, D}, title = {Vocal size exaggeration may have contributed to the origins of vocalic complexity.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {377}, number = {1841}, pages = {20200401}, pmid = {34775821}, issn = {1471-2970}, mesh = {Acoustics ; Animals ; Body Size ; Speech ; Vocalization, Animal ; *Voice ; }, abstract = {Vocal tract elongation, which uniformly lowers vocal tract resonances (formant frequencies) in animal vocalizations, has evolved independently in several vertebrate groups as a means for vocalizers to exaggerate their apparent body size. Here, we propose that smaller speech-like articulatory movements that alter only individual formants can serve a similar yet less energetically costly size-exaggerating function. To test this, we examine whether uneven formant spacing alters the perceived body size of vocalizers in synthesized human vowels and animal calls. Among six synthetic vowel patterns, those characterized by the lowest first and second formant (the vowel /u/ as in 'boot') are consistently perceived as produced by the largest vocalizer. Crucially, lowering only one or two formants in animal-like calls also conveys the impression of a larger body size, and lowering the second and third formants simultaneously exaggerates perceived size to a similar extent as rescaling all formants. As the articulatory movements required for individual formant shifts are minor compared to full vocal tract extension, they represent a rapid and energetically efficient mechanism for acoustic size exaggeration. We suggest that, by favouring the evolution of uneven formant patterns in vocal communication, this deceptive strategy may have contributed to the origins of the phonemic diversification required for articulated speech. This article is part of the theme issue 'Voice modulation: from origin and mechanism to social impact (Part II)'.}, } @article {pmid34775819, year = {2022}, author = {Grawunder, S and Uomini, N and Samuni, L and Bortolato, T and Girard-Buttoz, C and Wittig, RM and Crockford, C}, title = {Chimpanzee vowel-like sounds and voice quality suggest formant space expansion through the hominoid lineage.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {377}, number = {1841}, pages = {20200455}, pmid = {34775819}, issn = {1471-2970}, mesh = {Acoustics ; Animals ; *Pan troglodytes/physiology ; Phonetics ; Speech Acoustics ; *Voice Quality ; }, abstract = {The origins of human speech are obscure; it is still unclear what aspects are unique to our species or shared with our evolutionary cousins, in part due to a lack of a common framework for comparison. 
We asked what chimpanzee and human vocal production acoustics have in common. We examined visible supra-laryngeal articulators of four major chimpanzee vocalizations (hoos, grunts, barks, screams) and their associated acoustic structures, using techniques from human phonetic and animal communication analysis. Data were collected from wild adult chimpanzees, Taï National Park, Ivory Coast. Both discriminant and principal component classification procedures revealed classification of call types. Discriminating acoustic features include voice quality and formant structure, mirroring phonetic features in human speech. Chimpanzee lip and jaw articulation variables also offered similar discrimination of call types. Formant maps distinguished call types with different vowel-like sounds. Comparing our results with published primate data, humans show less F1-F2 correlation and further expansion of the vowel space, particularly for [i] sounds. Unlike recent studies suggesting monkeys achieve human vowel space, we conclude from our results that supra-laryngeal articulatory capacities show moderate evolutionary change, with vowel space expansion continuing through hominoid evolution. Studies on more primate species will be required to substantiate this. This article is part of the theme issue 'Voice modulation: from origin and mechanism to social impact (Part II)'.}, } @article {pmid34756498, year = {2024}, author = {Davatz, GC and Yamasaki, R and Hachiya, A and Tsuji, DH and Montagnoli, AN}, title = {Source and Filter Acoustic Measures of Young, Middle-Aged and Elderly Adults for Application in Vowel Synthesis.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {2}, pages = {253-263}, doi = {10.1016/j.jvoice.2021.08.025}, pmid = {34756498}, issn = {1873-4588}, mesh = {Male ; Middle Aged ; Young Adult ; Humans ; Female ; Aged ; Adolescent ; Adult ; Aged, 80 and over ; *Speech Acoustics ; Prospective Studies ; Acoustics ; Sound ; *Voice ; Phonetics ; }, abstract = {INTRODUCTION: The output sound changes in important ways throughout life due to anatomical and physiological modifications in the larynx and vocal tract. Understanding the acoustic characteristics of speech from young adulthood to old age may assist in the synthesis of representative voices of men and women of different age groups.

OBJECTIVE: To obtain the fundamental frequency (f0), formant frequencies (F1, F2, F3, F4), and bandwidth (B1, B2, B3, B4) values extracted from the sustained vowel /a/ of young, middle-aged, and elderly adults who are Brazilian Portuguese speakers; to present the application of these parameters in vowel synthesis.

STUDY DESIGN: Prospective study.

METHODS: Acoustic analysis was performed on 162 tokens of the sustained vowel /a/ produced by vocally healthy men and women between 18 and 80 years old. The adults were divided into three groups: young adults (18 to 44 years old), middle-aged adults (45 to 59 years old), and elderly adults (60 to 80 years old). The f0, F1, F2, F3, F4, B1, B2, B3, and B4 were extracted from the audio signals. Their average values were applied to a source-filter mathematical model to perform vowel synthesis for each age group, for both men and women.
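The synthesis step described here is the classic source-filter recipe: an impulse train at the measured f0 drives a cascade of second-order resonators, one per formant frequency and bandwidth. A compact sketch with illustrative values, not the paper's group means:

```python
# Sketch of source-filter vowel synthesis: impulse-train source at f0,
# filtered by a cascade of second-order all-pole resonators, each defined
# by a formant frequency Fi and bandwidth Bi (Klatt-style, unity gain at DC).
import numpy as np
from scipy.signal import lfilter

def resonator(x, f, bw, fs):
    r = np.exp(-np.pi * bw / fs)              # pole radius from bandwidth
    c = 2 * r * np.cos(2 * np.pi * f / fs)    # pole angle from formant frequency
    return lfilter([1 - c + r * r], [1, -c, r * r], x)

fs, f0, dur = 16000, 120.0, 0.5
n = int(fs * dur)
source = np.zeros(n)
source[::int(fs / f0)] = 1.0                  # glottal source approximated by impulses
formants = [(700, 90), (1200, 110), (2600, 160), (3300, 200)]  # (Fi, Bi) in Hz, illustrative
y = source
for f, bw in formants:
    y = resonator(y, f, bw, fs)
y /= np.max(np.abs(y))                        # normalized synthetic /a/-like vowel
```

Swapping in a group's average f0, formant, and bandwidth values is what turns such a model into an age- and sex-representative synthetic vowel.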

RESULTS: Young women had higher f0 than middle-aged and elderly women. Elderly women had lower F1 than middle-aged women. Young women had higher F2 than elderly women. For the men's output sound, the source-filter acoustic measures were statistically equivalent among the age groups. Average values of the f0, F1, F2, F3, F4, B1, and B2 were higher in women. The spacing of the sound waves in the signals, the positions of the formant frequencies, and the widths of the bandwidths visible in the spectra of the synthesized sounds represent the average values extracted from the volunteers' productions of the sustained vowel /a/ in Brazilian Portuguese.

CONCLUSION: The sustained vowel /a/ produced by women presented different values of f0, F1, and F2 between age groups, which was not observed for men. In addition to the f0 and the formant frequencies, the bandwidths were also different between women and men. The synthetic vowels available represent the acoustic changes found for each sex as a function of age.}, } @article {pmid34735295, year = {2021}, author = {Rowe, HP and Stipancic, KL and Lammert, AC and Green, JR}, title = {Validation of an Acoustic-Based Framework of Speech Motor Control: Assessing Criterion and Construct Validity Using Kinematic and Perceptual Measures.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {12}, pages = {4736-4753}, pmid = {34735295}, issn = {1558-9102}, support = {F31 DC019556/DC/NIDCD NIH HHS/United States ; R01 DC013547/DC/NIDCD NIH HHS/United States ; R01 DC009890/DC/NIDCD NIH HHS/United States ; K24 DC016312/DC/NIDCD NIH HHS/United States ; R01 DC017291/DC/NIDCD NIH HHS/United States ; }, mesh = {Acoustics ; Biomechanical Phenomena ; Humans ; *Speech ; Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {PURPOSE: This study investigated the criterion (analytical and clinical) and construct (divergent) validity of a novel, acoustic-based framework composed of five key components of motor control: Coordination, Consistency, Speed, Precision, and Rate.

METHOD: Acoustic and kinematic analyses were performed on audio recordings from 22 subjects with amyotrophic lateral sclerosis during a sequential motion rate task. Perceptual analyses were completed by two licensed speech-language pathologists, who rated each subject's speech on the five framework components and their overall severity. Analytical and clinical validity were assessed by comparing performance on the acoustic features to their kinematic correlates and to clinician ratings of the five components, respectively. Divergent validity of the acoustic-based framework was then assessed by comparing performance on each pair of acoustic features to determine whether the features represent distinct articulatory constructs. Bivariate correlations and partial correlations with severity as a covariate were conducted for each comparison.
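Since severity is used as a covariate throughout these analyses, a worked partial correlation makes the procedure concrete: regress the covariate out of both variables and correlate the residuals. A small sketch on synthetic stand-in data:

```python
# Sketch of a partial correlation with severity as the covariate: remove the
# linear effect of severity from both variables, then correlate the residuals.
# All data here are synthetic stand-ins for the acoustic/kinematic measures.
import numpy as np

def partial_corr(x, y, z):
    """Correlation of x and y after removing the linear effect of covariate z."""
    def resid(a):
        return a - np.polyval(np.polyfit(z, a, 1), z)
    return np.corrcoef(resid(x), resid(y))[0, 1]

rng = np.random.default_rng(0)
severity = rng.normal(size=22)                               # 22 subjects, as in the study
acoustic = 0.8 * severity + rng.normal(scale=0.5, size=22)   # e.g., an F2-slope feature
kinematic = 0.7 * severity + rng.normal(scale=0.5, size=22)  # its kinematic correlate
print(np.corrcoef(acoustic, kinematic)[0, 1],                # raw (severity-inflated)
      partial_corr(acoustic, kinematic, severity))           # severity controlled
```

The drop from the raw to the partial coefficient illustrates how a strong association can weaken once severity is controlled, as reported for Speed and Precision.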

RESULTS: Results revealed moderate-to-strong analytical validity for every acoustic feature, both with and without controlling for severity, and moderate-to-strong clinical validity for all acoustic features except Coordination, without controlling for severity. When severity was included as a covariate, the strong associations for Speed and Precision became weak. Divergent validity was supported by weak-to-moderate pairwise associations between all acoustic features except Speed (second-formant [F2] slope of consonant transition) and Precision (between-consonant variability in F2 slope).

CONCLUSIONS: This study demonstrated that the acoustic-based framework has potential as an objective, valid, and clinically useful tool for profiling articulatory deficits in individuals with speech motor disorders. The findings also suggest that compared to clinician ratings, instrumental measures are more sensitive to subtle differences in articulatory function. With further research, this framework could provide more accurate and reliable characterizations of articulatory impairment, which may eventually increase clinical confidence in the diagnosis and treatment of patients with different articulatory phenotypes.}, } @article {pmid34734018, year = {2021}, author = {Xia, M and Cao, S and Zhou, R and Wang, JY and Xu, TY and Zhou, ZK and Qian, YM and Jiang, H}, title = {Acoustic features as novel predictors of difficult laryngoscopy in orthognathic surgery: an observational study.}, journal = {Annals of translational medicine}, volume = {9}, number = {18}, pages = {1466}, pmid = {34734018}, issn = {2305-5839}, abstract = {BACKGROUND: Evaluation for difficult intubation is an important step before anaesthesia, as unanticipated difficult intubation is associated with morbidity and mortality. This study aimed to determine whether acoustic features are valuable as an alternative method for predicting difficult laryngoscopy (DL) in patients scheduled to undergo orthognathic surgery.

METHODS: This study included 225 adult patients who were undergoing elective orthognathic surgery under general anaesthesia with tracheal intubation. Preoperatively, clinical airway evaluation was performed, and the acoustic data were collected. Twelve phonemes {[a], [o], [e], [i], [u], [ü], [ci], [qi], [chi], [le], [ke], and [en]} were recorded, and their formants (f1-f4) and bandwidths (bw1-bw4) were extracted. Difficult laryngoscopy was defined as direct laryngoscopy with a Cormack-Lehane grade of 3 or 4. Univariate and multivariate logistic regression analyses were used to examine the associations between acoustic features and DL.
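The modelling step (phoneme-wise formant and bandwidth features feeding a logistic regression) can be sketched as below; the feature matrix is synthetic and the backward stepwise selection is omitted, so this only mirrors the shape of the analysis:

```python
# Sketch of a DL classifier in the spirit of the study: formant/bandwidth
# features -> logistic regression -> AUC and odds ratios. The feature matrix
# and outcome are synthetic; only the prevalence (~26%) matches the report.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(225, 7))        # stand-ins for en_f2, ci_bw4, qi_bw4, le_f3, o_bw4, chi_f4, a_bw4
y = (rng.random(225) < 0.26).astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
model = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print("test AUC:", roc_auc_score(y_te, model.predict_proba(X_te)[:, 1]))
print("odds ratios:", np.exp(model.coef_).round(3))   # exponentiated coefficients
```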

RESULTS: Difficult laryngoscopy was reported in 59/225 (26.2%) patients. The area under the curve (AUC) of the backward stepwise model including en_f2 [odds ratio (OR), 0.996; 95% confidence interval (CI), 0.994-0.999; P=0.006], ci_bw4 (OR, 0.997; 95% CI, 0.993-1.000; P=0.057), qi_bw4 (OR, 0.996; 95% CI, 0.993-0.999; P=0.017), le_f3 (OR, 0.998; 95% CI, 0.996-1.000; P=0.079), o_bw4 (OR, 1.001; 95% CI, 1.000-1.003; P=0.014), chi_f4 (OR, 1.003; 95% CI, 1.000-1.005; P=0.041), and a_bw4 (OR, 0.999; 95% CI, 0.998-1.000; P=0.078) attained a value of 0.761 in the training set, but a value of 0.709 in the testing set. The sensitivity and specificity of the model in the testing set were 86.7% and 63.0%, respectively.

CONCLUSIONS: Acoustic features may be considered useful predictors of DL during orthognathic surgery.}, } @article {pmid34731577, year = {2021}, author = {Abur, D and Subaciute, A and Daliri, A and Lester-Smith, RA and Lupiani, AA and Cilento, D and Enos, NM and Weerathunge, HR and Tardif, MC and Stepp, CE}, title = {Feedback and Feedforward Auditory-Motor Processes for Voice and Articulation in Parkinson's Disease.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {12}, pages = {4682-4694}, pmid = {34731577}, issn = {1558-9102}, support = {F31 DC019032/DC/NIDCD NIH HHS/United States ; R01 DC016270/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Feedback ; Feedback, Sensory ; Humans ; *Parkinson Disease/complications ; Speech ; Speech Intelligibility/physiology ; Speech Production Measurement ; *Voice ; }, abstract = {PURPOSE: Unexpected and sustained manipulations of auditory feedback during speech production result in "reflexive" and "adaptive" responses, which can shed light on feedback and feedforward auditory-motor control processes, respectively. Persons with Parkinson's disease (PwPD) have shown aberrant reflexive and adaptive responses, but responses appear to differ for control of vocal and articulatory features. However, these responses have not been examined for both voice and articulation in the same speakers and with respect to auditory acuity and functional speech outcomes (speech intelligibility and naturalness).

METHOD: Here, 28 PwPD on their typical dopaminergic medication schedule and 28 age-, sex-, and hearing-matched controls completed tasks yielding reflexive and adaptive responses as well as auditory acuity for both vocal and articulatory features.

RESULTS: No group differences were found for any measures of auditory-motor control, conflicting with prior findings in PwPD while off medication. Auditory-motor measures were also compared with listener ratings of speech function: first formant frequency acuity was related to speech intelligibility, whereas adaptive responses to vocal fundamental frequency manipulations were related to speech naturalness.

CONCLUSIONS: These results support that auditory-motor processes for both voice and articulatory features are intact for PwPD receiving medication. This work is also the first to suggest associations between measures of auditory-motor control and speech intelligibility and naturalness.}, } @article {pmid34717445, year = {2021}, author = {Cheung, ST and Thompson, K and Chen, JL and Yunusova, Y and Beal, DS}, title = {Response patterns to vowel formant perturbations in children.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {4}, pages = {2647}, doi = {10.1121/10.0006567}, pmid = {34717445}, issn = {1520-8524}, mesh = {Adaptation, Physiological ; Adolescent ; Child ; Child, Preschool ; Feedback, Sensory ; Humans ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception ; Speech Production Measurement ; }, abstract = {Auditory feedback is an important component of speech motor control, but its precise role in developing speech is less understood. The role of auditory feedback in development was probed by perturbing the speech of children 4-9 years old. The vowel sound /ɛ/ was shifted to /æ/ in real time and presented to participants as their own auditory feedback. Analyses of the resultant formant magnitude changes in the participants' speech indicated that children compensated and adapted by adjusting their formants to oppose the perturbation. Older and younger children responded to perturbation differently in F1 and F2. The compensatory change in F1 was greater for younger children, whereas the increase in F2 was greater for older children. Adaptation aftereffects were observed in both groups. Exploratory directional analyses in the two-dimensional formant space indicated that older children responded more directly and less variably to the perturbation than younger children, shifting their vowels back toward the vowel sound /ɛ/ to oppose the perturbation. Findings support the hypothesis that auditory feedback integration continues to develop between the ages of 4 and 9 years old such that the differences in the adaptive and compensatory responses arise between younger and older children despite receiving the same auditory feedback perturbation.}, } @article {pmid34717269, year = {2021}, author = {Tang, DL and McDaniel, A and Watkins, KE}, title = {Disruption of speech motor adaptation with repetitive transcranial magnetic stimulation of the articulatory representation in primary motor cortex.}, journal = {Cortex; a journal devoted to the study of the nervous system and behavior}, volume = {145}, number = {}, pages = {115-130}, pmid = {34717269}, issn = {1973-8102}, support = {/WT_/Wellcome Trust/United Kingdom ; }, mesh = {Adaptation, Physiological ; Feedback, Sensory ; Humans ; *Motor Cortex ; *Speech ; Transcranial Magnetic Stimulation ; }, abstract = {When auditory feedback perturbation is introduced in a predictable way over a number of utterances, speakers learn to compensate by adjusting their own productions, a process known as sensorimotor adaptation. Despite multiple lines of evidence indicating the role of primary motor cortex (M1) in motor learning and memory, whether M1 causally contributes to sensorimotor adaptation in the speech domain remains unclear. Here, we aimed to assay whether temporary disruption of the articulatory representation in left M1 by repetitive transcranial magnetic stimulation (rTMS) impairs speech adaptation. 
To induce sensorimotor adaptation, the frequencies of first formants (F1) were shifted up and played back to participants when they produced "head", "bed", and "dead" repeatedly (the learning phase). A low-frequency rTMS train (0.6 Hz, subthreshold, 12 min) over either the tongue or the hand representation of M1 (between-subjects design) was applied before participants experienced altered auditory feedback in the learning phase. We found that the group that received rTMS over the hand representation showed the expected compensatory response for the upwards shift in F1 by significantly reducing F1 and increasing the second formant (F2) frequencies in their productions. In contrast, these expected compensatory changes in both F1 and F2 did not occur in the group that received rTMS over the tongue representation. Critically, rTMS (subthreshold) over the tongue representation did not affect vowel production, which was unchanged from baseline. These results provide direct evidence that the articulatory representation in left M1 causally contributes to sensorimotor learning in speech. Furthermore, these results also suggest that M1 is critical to the network supporting a more global adaptation that aims to move the altered speech production closer to a learnt pattern of speech production used to produce another vowel.}, } @article {pmid34714438, year = {2022}, author = {Sturdy, SK and Smith, DRR and George, DN}, title = {Domestic dogs (Canis lupus familiaris) are sensitive to the correlation between pitch and timbre in human speech.}, journal = {Animal cognition}, volume = {25}, number = {3}, pages = {545-554}, pmid = {34714438}, issn = {1435-9456}, mesh = {Animals ; Dogs ; Female ; Humans ; Male ; Pitch Perception ; Sex Characteristics ; Speech ; Speech Acoustics ; *Voice ; *Wolves ; }, abstract = {The perceived pitch of human voices is highly correlated with the fundamental frequency (f0) of the laryngeal source, which is determined largely by the length and mass of the vocal folds. The vocal folds are larger in adult males than in adult females, and men's voices consequently have a lower pitch than women's. The length of the supralaryngeal vocal tract (vocal-tract length; VTL) affects the resonant frequencies (formants) of speech which characterize the timbre of the voice. Men's longer vocal tracts produce lower frequency, and less dispersed, formants than women's shorter vocal tracts. Pitch and timbre combine to influence the perception of speaker characteristics such as size and age. Together, they can be used to categorize speaker sex with almost perfect accuracy. While it is known that domestic dogs can match a voice to a person of the same sex, there has been no investigation into whether dogs are sensitive to the correlation between pitch and timbre. We recorded a female voice giving three commands ('Sit', 'Lay down', 'Come here'), and manipulated the recordings to lower the fundamental frequency (thus lowering pitch), increase simulated VTL (hence affecting timbre), or both (synthesized adult male voice). Dogs responded to the original adult female and synthesized adult male voices equivalently. Their tendency to obey the commands was, however, reduced when either pitch or timbre was manipulated alone.
These results suggest that dogs are sensitive to both the pitch and timbre of human voices, and that they learn about the natural covariation of these perceptual attributes.}, } @article {pmid34649740, year = {2024}, author = {Lester-Smith, RA and Derrick, E and Larson, CR}, title = {Characterization of Source-Filter Interactions in Vocal Vibrato Using a Neck-Surface Vibration Sensor: A Pilot Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {1}, pages = {1-9}, pmid = {34649740}, issn = {1873-4588}, support = {90AR5015/ACL/ACL HHS/United States ; R21 DC017001/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Pilot Projects ; Voice Quality ; Vibration ; *Voice/physiology ; *Singing ; }, abstract = {PURPOSE: Vocal vibrato is a singing technique that involves periodic modulation of fundamental frequency (fo) and intensity. The physiological sources of modulation within the speech mechanism and the interactions between the laryngeal source and vocal tract filter in vibrato are not fully understood. Therefore, the purpose of this study was to determine if differences in the rate and extent of fo and intensity modulation could be captured using simultaneously recorded signals from a neck-surface vibration sensor and a microphone, which represent features of the source before and after supraglottal vocal tract filtering.

METHOD: Nine classically-trained singers produced sustained vowels with vibrato while simultaneous signals were recorded using a vibration sensor and a microphone. Acoustical analyses were performed to measure the rate and extent of fo and intensity modulation for each trial. Paired-samples sign tests were used to analyze differences between the rate and extent of fo and intensity modulation in the vibration sensor and microphone signals.
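Rate and extent of f0 modulation are typically read off a detrended f0 contour and its spectrum. A minimal sketch on a synthetic contour (5.5 Hz vibrato, about a semitone of extent), not the authors' exact procedure:

```python
# Sketch of vibrato rate/extent estimation: convert the f0 contour to cents,
# detrend by the mean, locate the dominant modulation frequency by FFT, and
# take half the peak-to-peak excursion as the extent. Contour is synthetic.
import numpy as np

fs_contour = 100.0                               # f0 samples per second
t = np.arange(0, 2, 1 / fs_contour)
f0 = 220 * 2 ** ((1.0 / 12) * np.sin(2 * np.pi * 5.5 * t))   # +/- 1 semitone around 220 Hz

cents = 1200 * np.log2(f0 / np.mean(f0))
spec = np.abs(np.fft.rfft(cents - np.mean(cents)))
freqs = np.fft.rfftfreq(cents.size, 1 / fs_contour)
band = (freqs >= 3) & (freqs <= 9)               # typical vibrato-rate search range
rate = freqs[band][np.argmax(spec[band])]
extent = (np.max(cents) - np.min(cents)) / 2     # half peak-to-peak, in cents
print(f"rate ~ {rate:.1f} Hz, extent ~ {extent:.0f} cents")
```

The same recipe applies to an intensity contour, which is what allows rate and extent to be compared across the vibration-sensor and microphone signals.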

RESULTS: The rate and extent of fo modulation and the extent of intensity modulation were equivalent in the vibration sensor and microphone signals, but the rate of intensity modulation was significantly higher in the microphone signal than in the vibration sensor signal. Larger differences in the rate of intensity modulation were seen with vowels that typically have smaller differences between the first and second formant frequencies.

CONCLUSIONS: This study demonstrated that the rate of intensity modulation at the source prior to supraglottal vocal tract filtering, as measured in neck-surface vibration sensor signals, was lower than the rate of intensity modulation after supraglottal vocal tract filtering, as measured in microphone signals. The difference in rate varied based on the vowel. These findings provide further support of the resonance-harmonics interaction in vocal vibrato. Further investigation is warranted to determine if differences in the physiological source(s) of vibrato account for inconsistent relationships between the extent of intensity modulation in neck-surface vibration sensor and microphone signals.}, } @article {pmid34642073, year = {2024}, author = {Tarai, SK and Chatterjee, I and Pani, S}, title = {A Comparative Acoustic Analysis of Bangla Folk Song and RabindraSangeet on Long-Term Average Spectrum.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {2}, pages = {304-308}, doi = {10.1016/j.jvoice.2021.08.014}, pmid = {34642073}, issn = {1873-4588}, mesh = {Humans ; Female ; Adolescent ; Young Adult ; Adult ; Voice Quality ; *Voice ; Phonation ; Acoustics ; *Singing ; }, abstract = {BACKGROUND: Singing is defined as a sensory-motor phenomenon that requires particular balanced physical skills such as respiration, phonation, resonance, and articulation. The long-term average spectrum (LTAS) is widely accepted as a robust and effective tool for the assessment of voice characteristics.

METHOD: Eighty female singers aged 18-30 years participated in the study. Of the 80 participants, 40 were asked to perform one traditional Bangla Folk song representing the Baul style, and the other 40 were asked to perform a traditional Rabindra Sangeet song. Recordings were made, and acoustic (LTAS) analyses were performed with the PRAAT software. Statistical analyses of the extracted data were performed using the Statistical Package for the Social Sciences (SPSS, Version 20.0).
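Outside PRAAT, an LTAS can be approximated by averaging the power spectrum over the whole recording, for example with Welch's method. A sketch with a synthetic signal standing in for the sung performances; the 2-4 kHz peak search mirrors the singer's-formant region examined in the results:

```python
# Sketch of an LTAS: average the power spectrum over the recording (Welch's
# method) and express it in dB re the maximum. A noise-plus-2.8-kHz-tone
# signal stands in for a sung performance.
import numpy as np
from scipy.signal import welch

fs = 44100
t = np.arange(10 * fs) / fs
x = 0.1 * np.random.default_rng(0).standard_normal(t.size) + np.sin(2 * np.pi * 2800 * t)

freqs, psd = welch(x, fs=fs, nperseg=4096)
ltas_db = 10 * np.log10(psd / np.max(psd))

region = (freqs >= 2000) & (freqs <= 4000)       # singer's-formant region
peak_hz = freqs[region][np.argmax(ltas_db[region])]
print(f"spectral peak in the 2-4 kHz region: {peak_hz:.0f} Hz")
```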

RESULTS: The averaged LTAS curve of the Baul style showed a broad peak in the frequency range between 2000 and 3600 Hz with an amplitude of about 16 dB, whereas Rabindra Sangeet showed a broader peak between 2200 and 3800 Hz with an amplitude of about 15 dB. This evidence indicates the presence of the singer's formant in both singing styles.

CONCLUSION: It can be concluded from the present study that there is an acoustic difference between the Bangla Folk and Rabindra Sangeet singing styles, which can be demonstrated using LTAS in PRAAT.}, } @article {pmid34642071, year = {2024}, author = {Lee, Y and Park, HJ and Bae, IH and Kim, G}, title = {Resonance Characteristics in Epiglottic Cyst: Formant Frequency, Vowel Space Area, Vowel Articulatory Index, and Formant Centralization Ratio.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {2}, pages = {273-278}, doi = {10.1016/j.jvoice.2021.09.008}, pmid = {34642071}, issn = {1873-4588}, mesh = {Humans ; Male ; Speech Acoustics ; *Laryngeal Diseases/diagnostic imaging/surgery ; *Voice ; Voice Quality ; *Cysts/diagnostic imaging/surgery ; Phonetics ; }, abstract = {OBJECTIVES: Resonance characteristics can change due to alterations in the shape of the vocal tract in patients with epiglottic cysts. This study aimed to analyze the resonance characteristics before and after the surgical excision of epiglottic cysts.

METHODS: Twelve male patients with epiglottic cysts were enrolled in this study. We analyzed the first and second formants (F1 and F2) in vowels /a/, /e/, /i/, /o/, and /u/, vowel space area (VSA), vowel articulatory index (VAI), and formant centralization ratio (FCR). We measured these parameters before and after the surgical excision of epiglottic cysts.
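The derived measures have standard closed forms: the triangular VSA over the corner vowels, and the Sapir-style VAI with FCR as its reciprocal. A direct transcription, assuming the triangular variant over /a/, /i/, /u/ and illustrative formant values in Hz:

```python
# VSA, VAI, and FCR from corner-vowel formants. VSA is the area of the
# /a/-/i/-/u/ triangle in (F1, F2) space; FCR = 1 / VAI. Values are
# illustrative, not the study's measurements.
def vsa(f1a, f2a, f1i, f2i, f1u, f2u):
    """Triangular vowel space area spanned by /a/, /i/, /u/ (Hz^2)."""
    return abs(f1i * (f2a - f2u) + f1a * (f2u - f2i) + f1u * (f2i - f2a)) / 2

def vai(f1a, f2a, f1i, f2i, f1u, f2u):
    """Vowel articulatory index; larger means a more expanded vowel space."""
    return (f2i + f1a) / (f1i + f1u + f2a + f2u)

pre = dict(f1a=700, f2a=1250, f1i=310, f2i=2100, f1u=330, f2u=750)
print(vsa(**pre), vai(**pre), 1 / vai(**pre))   # FCR is the reciprocal of VAI
```

Under this construction, the reported pattern (VSA and VAI up, FCR down after surgery) is internally consistent, since FCR must fall whenever VAI rises.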

RESULTS: There was a significant increase in the F1 values of /a/, VSA, and VAI, and a significant decrease in the value of FCR after the surgery.

CONCLUSION: We confirmed changes in the resonance characteristics of patients with epiglottic cysts. Further studies on epiglottic cysts and resonance changes are needed.}, } @article {pmid34641989, year = {2021}, author = {König, A and Mallick, E and Tröger, J and Linz, N and Zeghari, R and Manera, V and Robert, P}, title = {Measuring neuropsychiatric symptoms in patients with early cognitive decline using speech analysis.}, journal = {European psychiatry : the journal of the Association of European Psychiatrists}, volume = {64}, number = {1}, pages = {e64}, pmid = {34641989}, issn = {1778-3585}, mesh = {Aged ; Anxiety/diagnosis ; *Apathy ; *Cognitive Dysfunction/diagnosis ; Female ; Humans ; Machine Learning ; Male ; Neuropsychological Tests ; Speech ; }, abstract = {BACKGROUND: Certain neuropsychiatric symptoms (NPS), namely apathy, depression, and anxiety, have demonstrated great value in predicting dementia progression, potentially representing a window of opportunity for timely diagnosis and treatment. However, sensitive and objective markers of these symptoms are still missing. Therefore, the present study aims to investigate the association between automatically extracted speech features and NPS in patients with mild neurocognitive disorders.

METHODS: Speech of 141 patients aged 65 or older with neurocognitive disorder was recorded while they performed two short narrative speech tasks. NPS were assessed with the neuropsychiatric inventory. Paralinguistic markers relating to prosodic, formant, source, and temporal qualities of speech were automatically extracted and correlated with NPS. Machine learning experiments were carried out to validate the diagnostic power of the extracted markers.
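
The pipeline described (automatic paralinguistic feature extraction followed by regression against NPS scores) could be sketched as follows; parselmouth and scikit-learn stand in for the authors' unnamed toolchain, and the file names and scores are placeholders:

```python
# Sketch of the described pipeline: extract prosodic/temporal markers from
# recordings, then regress them against NPS scores. Not the authors' code.
import numpy as np
import parselmouth
from sklearn.ensemble import RandomForestRegressor

def speech_features(path):
    snd = parselmouth.Sound(path)
    f0 = snd.to_pitch().selected_array["frequency"]
    f0 = f0[f0 > 0]                                 # voiced frames only
    return [
        float(np.mean(f0)), float(np.std(f0)),      # prosodic markers
        float(np.mean(snd.to_intensity().values)),  # energy marker
        snd.get_total_duration(),                   # crude temporal marker
    ]

paths = ["patient_001.wav", "patient_002.wav"]      # hypothetical recordings
apathy = np.array([4.0, 1.0])                       # placeholder NPI subscores

X = np.array([speech_features(p) for p in paths])
model = RandomForestRegressor(n_estimators=200, random_state=0).fit(X, apathy)
print(model.predict(X))
```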

RESULTS: Different speech variables are associated with specific NPS; apathy correlates with temporal aspects, and anxiety with voice quality; these associations were mostly consistent between male and female participants after correction for cognitive impairment. Machine learning regressors were able to extract information from speech features and performed above baseline in predicting anxiety, apathy, and depression scores.

CONCLUSIONS: Different NPS seem to be characterized by distinct speech features, which are easily extractable automatically from short vocal tasks. These findings support the use of speech analysis for detecting subtypes of NPS in patients with cognitive impairment. This could have great implications for the design of future clinical trials as this cost-effective method could allow more continuous and even remote monitoring of symptoms.}, } @article {pmid34632373, year = {2021}, author = {Coto-Solano, R and Stanford, JN and Reddy, SK}, title = {Advances in Completely Automated Vowel Analysis for Sociophonetics: Using End-to-End Speech Recognition Systems With DARLA.}, journal = {Frontiers in artificial intelligence}, volume = {4}, number = {}, pages = {662097}, pmid = {34632373}, issn = {2624-8212}, abstract = {In recent decades, computational approaches to sociophonetic vowel analysis have been steadily increasing, and sociolinguists now frequently use semi-automated systems for phonetic alignment and vowel formant extraction, including FAVE (Forced Alignment and Vowel Extraction, Rosenfelder et al., 2011; Evanini et al., Proceedings of Interspeech, 2009), Penn Aligner (Yuan and Liberman, J. Acoust. Soc. America, 2008, 123, 3878), and DARLA (Dartmouth Linguistic Automation), (Reddy and Stanford, DARLA Dartmouth Linguistic Automation: Online Tools for Linguistic Research, 2015a). Yet these systems still have a major bottleneck: manual transcription. For most modern sociolinguistic vowel alignment and formant extraction, researchers must first create manual transcriptions. This human step is painstaking, time-consuming, and resource intensive. If this manual step could be replaced with completely automated methods, sociolinguists could potentially tap into vast datasets that have previously been unexplored, including legacy recordings that are underutilized due to lack of transcriptions. Moreover, if sociolinguists could quickly and accurately extract phonetic information from the millions of hours of new audio content posted on the Internet every day, a virtual ocean of speech from newly created podcasts, videos, live-streams, and other audio content would now inform research. How close are the current technological tools to achieving such groundbreaking changes for sociolinguistics? Prior work (Reddy et al., Proceedings of the North American Association for Computational Linguistics 2015 Conference, 2015b, 71-75) showed that an HMM-based Automated Speech Recognition system, trained with CMU Sphinx (Lamere et al., 2003), was accurate enough for DARLA to uncover evidence of the US Southern Vowel Shift without any human transcription. Even so, because that automatic speech recognition (ASR) system relied on a small training set, it produced numerous transcription errors. Six years have passed since that study, and since that time numerous end-to-end automatic speech recognition (ASR) algorithms have shown considerable improvement in transcription quality. One example of such a system is the RNN/CTC-based DeepSpeech from Mozilla (Hannun et al., 2014). (RNN stands for recurrent neural networks, the learning mechanism for DeepSpeech. CTC stands for connectionist temporal classification, the mechanism to merge phones into words). The present paper combines DeepSpeech with DARLA to push the technological envelope and determine how well contemporary ASR systems can perform in completely automated vowel analyses with sociolinguistic goals. 
Specifically, we used these techniques on audio recordings from 352 North American English speakers in the International Dialects of English Archive (IDEA), extracting 88,500 tokens of vowels in stressed position from spontaneous, free speech passages. With this large dataset we conducted acoustic sociophonetic analyses of the Southern Vowel Shift and the Northern Cities Chain Shift in the North American IDEA speakers. We compared the results using three different sources of transcriptions: 1) IDEA's manual transcriptions as the baseline "ground truth", 2) the ASR built on CMU Sphinx used by Reddy et al. (Proceedings of the North American Association for Computational Linguistics 2015 Conference, 2015b, 71-75), and 3) the latest publicly available Mozilla DeepSpeech system. We input these three different transcriptions to DARLA, which automatically aligned and extracted the vowel formants from the 352 IDEA speakers. Our quantitative results show that newer ASR systems like DeepSpeech show considerable promise for sociolinguistic applications like DARLA. We found that DeepSpeech's automated transcriptions had a significantly lower character error rate than those from the prior Sphinx system (from 46% to 35%). When we performed the sociolinguistic analysis of the extracted vowel formants from DARLA, we found that the automated transcriptions from DeepSpeech matched the results from the ground truth for the Southern Vowel Shift (SVS): five vowels showed a shift in both transcriptions, and two vowels did not show a shift in either transcription. The Northern Cities Shift (NCS) was more difficult to detect, but ground truth and DeepSpeech matched for four vowels: One of the vowels showed a clear shift, and three showed no shift in either transcription. Our study therefore shows how technology has made progress toward greater automation in vowel sociophonetics, while also showing what remains to be done. Our statistical modeling provides a quantified view of both the abilities and the limitations of a completely "hands-free" analysis of vowel shifts in a large dataset. Naturally, when comparing a completely automated system against a semi-automated system involving human manual work, there will always be a tradeoff between accuracy on the one hand versus speed and replicability on the other hand [Kendall and Joseph, Towards best practices in sociophonetics (with Marianna DiPaolo), 2014]. The amount of "noise" that can be tolerated for a given study will depend on the particular research goals and researchers' preferences. Nonetheless, our study shows that, for certain large-scale applications and research goals, a completely automated approach using publicly available ASR can produce meaningful sociolinguistic results across large datasets, and these results can be generated quickly, efficiently, and with full replicability.}, } @article {pmid34632133, year = {2021}, author = {Sondhi, S and Salhan, A and Santoso, CA and Doucoure, M and Dharmawan, DM and Sureka, A and Natasha, BN and Danusaputro, AD and Dowson, NS and Yap, MSL and Hadiwidjaja, MA and Veeraraghavan, SG and Hatta, AZR and Lee, C and Megantara, RA and Wihardja, AN and Sharma, M and Lardizabal, EL and Sondhi, LJ and Raina, R and Vashisth, S and Hedwig, R}, title = {Voice processing for COVID-19 scanning and prognostic indicator.}, journal = {Heliyon}, volume = {7}, number = {10}, pages = {e08134}, pmid = {34632133}, issn = {2405-8440}, abstract = {The COVID-19 pandemic has posed a serious risk of contagion to humans.
There is a need for reliable non-contact tests, such as vocal correlates of COVID-19 infection. Thirty-six volunteers of Asian ethnicity, comprising 16 infected subjects (8 male, 8 female) and 20 non-infected controls (10 male, 10 female), participated in this study by vocalizing the vowels /a/, /e/, /i/, /o/, /u/. Voice correlates of the 16 COVID-19 positive patients were compared during infection and after recovery with the 20 non-infected controls. Compared to non-infected controls, significantly higher values of energy intensity for /o/ (p = 0.048); formant F1 for /o/ (p = 0.014); and formant F3 for /u/ (p = 0.032) were observed in male patients, while higher values of Jitter (local, abs) for /o/ (p = 0.021) and Jitter (ppq5) for /a/ (p = 0.014) were observed in female patients. However, formant F2 for /u/ (p = 0.018) and mean pitch F0 for /e/, /i/ and /o/ (p = 0.033; 0.036; 0.047) decreased for female patients under infection. Compared to recovered conditions, HNR for /e/ (p = 0.014) was higher in male patients under infection, while Jitter (rap) for /a/ (p = 0.041); Jitter (ppq5) for /a/ (p = 0.032); Shimmer (local, dB) for /i/ (p = 0.024); Shimmer (apq5) for /u/ (p = 0.019); and formant F4 for vowel /o/ (p = 0.022) were higher in female patients under infection. However, HNR for /e/ (p = 0.041) and formant F1 for /o/ (p = 0.002) were lower in female patients compared to their recovered conditions. The obtained results support the hypothesis, since the changes in voice parameters observed in the infected patients can be correlated with a combination of acoustic measures such as fundamental frequency, formant characteristics, HNR, and voice perturbations such as jitter and shimmer for different vowels. Thus, voice analysis can be used for scanning and prognosis of COVID-19 infection. Based on the findings of this study, a mobile application can be developed to analyze human voice in real time to detect COVID-19 symptoms for remedial measures and necessary action.}, } @article {pmid34550454, year = {2022}, author = {Gama, R and Castro, ME and van Lith-Bijl, JT and Desuter, G}, title = {Does the wearing of masks change voice and speech parameters?.}, journal = {European archives of oto-rhino-laryngology : official journal of the European Federation of Oto-Rhino-Laryngological Societies (EUFOS) : affiliated with the German Society for Oto-Rhino-Laryngology - Head and Neck Surgery}, volume = {279}, number = {4}, pages = {1701-1708}, pmid = {34550454}, issn = {1434-4726}, mesh = {Acoustics ; Humans ; Phonation ; Speech ; Speech Acoustics ; *Voice ; *Voice Disorders/etiology/prevention & control ; Voice Quality ; }, abstract = {PURPOSE: The authors aim to review available reports on the potential effects of masks on voice and speech parameters.

METHODS: A literature search was conducted using the MEDLINE and Google Scholar databases through July 2021. Several target populations, mask scenarios, and methodologies were covered. The assessed voice parameters were divided into self-reported, acoustic, and aerodynamic.

RESULTS: Wearing a face mask was shown to induce several changes in voice parameters: (1) self-reported-significantly increased vocal effort and fatigue, increased vocal tract discomfort, and increased values of the voice handicap index (VHI) were observed; (2) acoustics-increased voice intensity, altered formant frequencies (F2 and F3) with no changes in fundamental frequency, increased harmonics-to-noise ratio (HNR), and increased mean spectral values at high-frequency levels (1000-8000 Hz), especially with the KN95 mask; (3) aerodynamics-maximum phonatory time was assessed in only two reports and showed no alterations.

CONCLUSION: Despite the different populations, mask-type scenarios, and methodologies described by each study, the results of this review outline the significant changes in voice characteristics with the use of face masks. Wearing a mask was shown to increase the perception of vocal effort and to alter vocal tract length and speech articulatory movements, leading to spectral sound changes and impaired communication and perception. Studies analyzing the effect of masks on voice aerodynamics are lacking. Further research is required to study the long-term effects of face masks on the potential development of voice pathology.}, } @article {pmid34543515, year = {2021}, author = {Wang, Y and Qiu, X and Wang, F and Li, Y and Guo, H and Nie, L}, title = {Single-crystal ordered macroporous metal-organic framework as support for molecularly imprinted polymers and their integration in membrane formant for the specific recognition of zearalenone.}, journal = {Journal of separation science}, volume = {44}, number = {22}, pages = {4190-4199}, doi = {10.1002/jssc.202100393}, pmid = {34543515}, issn = {1615-9314}, support = {20180307024//Science and Technology Project of Guangdong Province/ ; 2017KTSCX169//Department of Education of Guangdong Province/ ; SZ2018KJ03//scientific research project of Shaoguan University/ ; S202010576027//National College Students Innovation and Entrepreneurship Training Program/ ; pdjh2020a0530//Special Fund for Science and Technology Innovation Strategy of Guangdong Province/ ; CX20201043//Hunan Provincial Innovation Foundation For Postgraduate/ ; 19A144//Scientific Research Fund of Hunan Provincial Education Department/ ; 2019JJ60058//Natural Science Foundation of Hunan Province/ ; 2020JJ6102//Natural Science Foundation of Hunan Province/ ; }, mesh = {Chromatography, High Pressure Liquid/methods ; Edible Grain/*chemistry ; Extraction and Processing Industry/methods ; Food Contamination/analysis ; Metal-Organic Frameworks ; Molecular Imprinting/methods ; Molecularly Imprinted Polymers ; Mycotoxins/analysis/chemistry ; Solid Phase Extraction/methods ; Zearalenone/*analysis/chemistry ; }, abstract = {Zearalenone is a fungal contaminant that is widely present in grains. Here, a novel molecularly imprinted membrane based on SOM-ZIF-8 was developed for the rapid and highly selective identification of zearalenone in grain samples. The molecularly imprinted membrane was prepared using polyvinylidene fluoride, cyclododecyl 2,4-dihydroxybenzoate as a template and SOM-ZIF-8 as a carrier. The factors influencing the extraction of zearalenone using this membrane, including the solution pH, extraction time, elution solvent, elution time, and elution volume, were studied in detail. The optimized conditions were 5 mL of sample solution at pH 6, extraction time of 45 min, 4 mL of acetonitrile:methanol = 9:1 as elution solvent, and elution time of 20 min. This method displayed a good linear range of 12-120 ng/g (R² = 0.998), with limits of detection and quantification of 1.7 and 5.5 ng/g, respectively. In addition, the membrane was used to selectively identify zearalenone in grain samples with percent recoveries ranging from 87.9 to 101.0% and relative standard deviation of less than 6.6%.
Overall, this study presents a simple and effective chromatographic pretreatment method for detecting zearalenone in food samples.}, } @article {pmid34538710, year = {2022}, author = {Erdur, OE and Yilmaz, BS}, title = {Voice changes after surgically assisted rapid maxillary expansion.}, journal = {American journal of orthodontics and dentofacial orthopedics : official publication of the American Association of Orthodontists, its constituent societies, and the American Board of Orthodontics}, volume = {161}, number = {1}, pages = {125-132}, doi = {10.1016/j.ajodo.2020.06.055}, pmid = {34538710}, issn = {1097-6752}, mesh = {Acoustics ; Adult ; Humans ; Maxilla ; *Palatal Expansion Technique ; *Voice Quality ; }, abstract = {INTRODUCTION: This study aimed to investigate voice changes in patients who had surgically assisted rapid maxillary expansion (SARME).

METHODS: Nineteen adult patients with maxillary transverse deficiency were asked to pronounce the sounds "[a], [ϵ], [ɯ], [i], [ɔ], [œ], [u], [y]" for 3 seconds. Voice recordings were made before the expansion appliance was placed (T0) and 5.8 weeks after its removal (T1, after 5.2 months of retention). The same recordings were made for the control group (n = 19). The fundamental frequency (F0), formant frequencies (F1, F2, and F3), shimmer, jitter, and noise-to-harmonics ratio (NHR) were measured with Praat (version 6.0.43).
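
The perturbation measures named here can be reproduced with Praat's own algorithms through the parselmouth bridge; a minimal sketch, assuming a mono vowel recording (the file name and pitch bounds are placeholders, and note that Praat reports harmonicity as HNR):

```python
# Minimal sketch of jitter/shimmer/harmonicity measurement with Praat's
# algorithms via parselmouth; argument sets follow Praat's defaults.
import parselmouth
from parselmouth.praat import call

snd = parselmouth.Sound("vowel_a.wav")           # hypothetical recording
pp = call(snd, "To PointProcess (periodic, cc)", 75, 500)

jitter = call(pp, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
shimmer = call([snd, pp], "Get shimmer (local)",
               0, 0, 0.0001, 0.02, 1.3, 1.6)

harm = call(snd, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
hnr_db = call(harm, "Get mean", 0, 0)

print(f"jitter = {jitter:.4f}, shimmer = {shimmer:.4f}, HNR = {hnr_db:.1f} dB")
```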

RESULTS: In the SARME group, significant differences were observed in the F1 of [a] (P = 0.005), F2 of [ϵ] (P = 0.008), and [œ] sounds (P = 0.004). The postexpansion values were lower than those recorded before. In contrast, the F1 of the [y] sound (P = 0.02), F2 of the [u] sound (P = 0.01), the jitter parameter of the [ɯ] and [i] sounds (P = 0.04; P = 0.002), and the NHR value of the [ϵ] sound (P = 0.04) were significantly higher than the baseline values. In the comparison with the control group, significant differences were found in the F0 (P = 0.025) and F1 (P = 0.046) of the [u] sound, the F1 of the [a] sound (P = 0.03), and the F2 of the [ϵ] sound (P = 0.037). Significant differences were also found in the shimmer of [i] (P = 0.017) and [ɔ] (P = 0.002), the jitter of [ϵ] (P = 0.046) and [i] (P = 0.017), and the NHR of [i] (P = 0.012) and [ɔ] (P = 0.009).

CONCLUSION: SARME led to significant differences in some of the acoustic parameters.}, } @article {pmid34498908, year = {2022}, author = {Perlman, M and Paul, J and Lupyan, G}, title = {Vocal communication of magnitude across language, age, and auditory experience.}, journal = {Journal of experimental psychology. General}, volume = {151}, number = {4}, pages = {885-896}, doi = {10.1037/xge0001103}, pmid = {34498908}, issn = {1939-2222}, support = {//NSF-INSPIRE/ ; //NSF-PAC/ ; }, mesh = {Adolescent ; Animals ; China ; Culture ; Humans ; *Language ; *Voice ; }, abstract = {Like many other vocalizing vertebrates, humans convey information about their body size through the sound of their voice. Vocalizations of larger animals are typically longer in duration, louder in intensity, and lower in frequency. We investigated people's ability to use voice-size correspondences to communicate about the magnitude of external referents. First, we asked hearing children, as well as deaf children and adolescents, living in China to improvise nonlinguistic vocalizations to distinguish between paired items contrasting in magnitude (e.g., a long vs. short string, a big vs. small ball). Then we played these vocalizations back to adult listeners in the United States and China to assess their ability to correctly guess the intended referents. We find that hearing and deaf producers both signaled greater magnitude items with longer and louder vocalizations and with smaller formant spacing. Only hearing producers systematically used fundamental frequency, communicating greater magnitude with higher fo. The vocalizations of both groups were understandable to Chinese and American listeners, although accuracy was higher with vocalizations from older producers. American listeners relied on the same acoustic properties as Chinese listeners: both groups interpreted vocalizations with longer duration and greater intensity as referring to greater items; neither American nor Chinese listeners consistently used fo or formant spacing as a cue. These findings show that the human ability to use vocalizations to communicate about the magnitude of external referents is highly robust, extending across listeners of disparate linguistic and cultural backgrounds, as well as across age and auditory experience. (PsycInfo Database Record (c) 2022 APA, all rights reserved).}, } @article {pmid34482728, year = {2021}, author = {Stansbury, AL and Janik, VM}, title = {The role of vocal learning in call acquisition of wild grey seal pups.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {376}, number = {1836}, pages = {20200251}, pmid = {34482728}, issn = {1471-2970}, mesh = {Animals ; Female ; *Learning ; Male ; *Seals, Earless ; *Vocalization, Animal ; }, abstract = {Pinnipeds have been identified as one of the best available models for the study of vocal learning. Experimental evidence for their learning skills is demonstrated with advanced copying skills, particularly in formant structure when copying human speech sounds and melodies. By contrast, almost no data are available on how learning skills are used in their own communication systems. We investigated the impact of playing modified seal sounds in a breeding colony of grey seals (Halichoerus grypus) to study how acoustic input influenced vocal development of eight pups. Sequences of two or three seal pup calls were edited so that the average peak frequency between calls in a sequence changed up or down.
We found that seals copied the specific stimuli played to them and that copies became more accurate over time. The differential response of different groups showed that vocal production learning was used to achieve conformity, suggesting that geographical variation in seal calls can be caused by horizontal cultural transmission. While learning of pup calls appears to have few benefits, we suggest that it also affects the development of the adult repertoire, which may facilitate social interactions such as mate choice. This article is part of the theme issue 'Vocal learning in animals and humans'.}, } @article {pmid34474938, year = {2024}, author = {Güths, RC and Rolim, MRP and Coelho, A}, title = {Glottal Voice Distortions: Nasolaryngoscopic and Spectral Analysis of Anatomophysiologic Changes in Singing Voice.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {1}, pages = {31-39}, doi = {10.1016/j.jvoice.2021.07.018}, pmid = {34474938}, issn = {1873-4588}, mesh = {Humans ; *Singing ; Voice Quality ; *Voice ; Vocal Cords ; *Larynx ; }, abstract = {Distorted voices, commonly called vocal drives in Brazil and in some other South American countries, are vocal ornaments belonging to the aesthetics of popular singing and desired by singers of different styles. The advances in the vocal sciences have allowed the demystification of this type of technique over the last four decades, classifying such distortions as glottal, supraglottic, or mixed distortions/drives. The interdisciplinary approach in the evaluation of singers who use glottal distortions is fundamental for a broad understanding of the particularities of each case. The main objective of the present study is to describe the anatomophysiological and spectral findings of the glottal distortions identified in the practice of many singers. A sample of three singers performing sung emissions with and without vocal distortions was collected. A PreSonus® AudioBox Studio One kit was used to record the voice during the nasolaryngoscopic evaluation. The singers underwent vocal warm-up and functional evaluation of the larynx based on two studies on contemporary singers. The singers performed the Snarl Voice and Phaser distortions, and both showed particular anatomophysiological behaviors. The larynx was low in the first distortion and at the level of the clean voice in the second; a posterior opening of the glottis was observed in both distortions, with an additional opening of the middle third of the glottis in the first. Formants vary according to the vocal tract settings used for the distortions. The glottic distortions present a complex anatomophysiological behavior in their composition, with fundamental participation of the transverse interarytenoid muscle and the lateral cricoarytenoids, as well as the participation of the vocal fold in the frequency break.
F3 varied with the longitudinal length of the vocal tract and F4 with its diameter, both being related to three-dimensional adjustments of the vocal tract.}, } @article {pmid34470280, year = {2021}, author = {Stehr, DA and Hickok, G and Ferguson, SH and Grossman, ED}, title = {Examining vocal attractiveness through articulatory working space.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {2}, pages = {1548}, doi = {10.1121/10.0005730}, pmid = {34470280}, issn = {1520-8524}, mesh = {Acoustics ; Female ; Humans ; Language ; Male ; Phonetics ; Speech ; *Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {Robust gender differences exist in the acoustic correlates of clearly articulated speech, with females, on average, producing speech that is acoustically and phonetically more distinct than that of males. This study investigates the relationship between several acoustic correlates of clear speech and subjective ratings of vocal attractiveness. Talkers were recorded producing vowels in /bVd/ context and sentences containing the four corner vowels. Multiple measures of working vowel space were computed from continuously sampled formant trajectories and were combined with measures of speech timing known to co-vary with clear articulation. Partial least squares regression (PLS-R) modeling was used to predict ratings of vocal attractiveness for male and female talkers based on the acoustic measures. PLS components that loaded on size and shape measures of working vowel space-including the quadrilateral vowel space area, convex hull area, and bivariate spread of formants-along with measures of speech timing were highly successful at predicting attractiveness in female talkers producing /bVd/ words. These findings are consistent with a number of hypotheses regarding human attractiveness judgments, including the role of sexual dimorphism in mate selection, the significance of traits signalling underlying health, and perceptual fluency accounts of preferences.}, } @article {pmid34470262, year = {2021}, author = {Sahoo, S and Dandapat, S}, title = {Analyzing the vocal tract characteristics for out-of-breath speech.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {2}, pages = {1524}, doi = {10.1121/10.0005945}, pmid = {34470262}, issn = {1520-8524}, mesh = {Female ; Humans ; Male ; Sound Spectrography ; Speech ; *Speech Acoustics ; *Voice ; }, abstract = {In this work, vocal tract characteristic changes under the out-of-breath condition are explored. Speaking under the influence of physical exercise is called out-of-breath speech. The change in breathing pattern results in perceptual changes in the produced sound. For the vocal tract, the first four formants show a lowering in their average frequencies. The bandwidths BF1 and BF2 widen, whereas the other two get narrower. The change in bandwidth is small for the last three. For a given speaker, the change in frequency and bandwidth may not be uniform across formants. Subband analysis is carried out around the formants for comparing the variation of the vocal tract with the source. A vocal tract adaptive empirical wavelet transform is used for extracting formant-specific subbands from speech and source. A support vector machine performs the subband-based binary classification between normal and out-of-breath speech. For all speakers, it shows an F1-score improvement of 4% over speech subbands.
Similarly, a performance improvement of 5% can be seen for both male and female speakers. Furthermore, the misclassification rate is lower for the source than for speech. These results suggest that physical exercise influences the source more than the vocal tract.}, } @article {pmid34470045, year = {2022}, author = {Dastolfo-Hromack, C and Bush, A and Chrabaszcz, A and Alhourani, A and Lipski, W and Wang, D and Crammond, DJ and Shaiman, S and Dickey, MW and Holt, LL and Turner, RS and Fiez, JA and Richardson, RM}, title = {Articulatory Gain Predicts Motor Cortex and Subthalamic Nucleus Activity During Speech.}, journal = {Cerebral cortex (New York, N.Y. : 1991)}, volume = {32}, number = {7}, pages = {1337-1349}, pmid = {34470045}, issn = {1460-2199}, support = {U01 NS098969/NS/NINDS NIH HHS/United States ; }, mesh = {*Deep Brain Stimulation ; Humans ; *Motor Cortex/physiology ; *Parkinson Disease/therapy ; Speech ; *Subthalamic Nucleus/physiology ; }, abstract = {Speaking precisely is important for effective verbal communication, and articulatory gain is one component of speech motor control that contributes to achieving this goal. Given that the basal ganglia have been proposed to regulate the speed and size of limb movement, that is, movement gain, we explored the basal ganglia contribution to articulatory gain, through local field potentials (LFP) recorded simultaneously from the subthalamic nucleus (STN), precentral gyrus, and postcentral gyrus. During STN deep brain stimulation implantation for Parkinson's disease, participants read aloud consonant-vowel-consonant syllables. Articulatory gain was indirectly assessed using the F2 Ratio, an acoustic measurement of the second formant frequency of /i/ vowels divided by that of /u/ vowels. Mixed effects models demonstrated that the F2 Ratio correlated with alpha and theta activity in the precentral gyrus and STN. No correlations were observed for the postcentral gyrus. Functional connectivity analysis revealed that higher phase locking values for beta activity between the STN and precentral gyrus were correlated with lower F2 Ratios, suggesting that higher beta synchrony impairs articulatory precision. Effects were not related to disease severity. These data suggest that articulatory gain is encoded within the basal ganglia-cortical loop.}, } @article {pmid34400103, year = {2023}, author = {Aires, MM and de Vasconcelos, D and Lucena, JA and Gomes, AOC and Moraes, BT}, title = {Effect of Wendler glottoplasty on voice and quality of life of transgender women.}, journal = {Brazilian journal of otorhinolaryngology}, volume = {89}, number = {1}, pages = {22-29}, pmid = {34400103}, issn = {1808-8686}, mesh = {Male ; Humans ; Female ; Adult ; *Transgender Persons ; Quality of Life ; Prospective Studies ; Treatment Outcome ; Speech Acoustics ; }, abstract = {OBJECTIVE: To investigate the effect of Wendler glottoplasty on voice feminization, voice quality and voice-related quality of life.

METHODS: Prospective interventional cohort of transgender women submitted to Wendler glottoplasty. Acoustic analysis of the voice included assessment of fundamental frequency, maximum phonation time, formant frequencies (F1 and F2), frequency range, jitter, and shimmer. Voice quality was blindly assessed using the GRBAS scale. Voice-related quality of life was measured using the Trans Woman Voice Questionnaire and the self-perceived femininity of the voice.
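
Of the listed measures, the fundamental-frequency ones are straightforward to reproduce; a sketch using parselmouth, where the file names and pitch bounds are assumptions rather than the study's settings:

```python
# Sketch of the F0 measures above: mean speaking F0 and frequency range.
import numpy as np
import parselmouth

def f0_stats(path, floor=75.0, ceiling=600.0):
    snd = parselmouth.Sound(path)
    pitch = snd.to_pitch(pitch_floor=floor, pitch_ceiling=ceiling)
    f0 = pitch.selected_array["frequency"]
    f0 = f0[f0 > 0]                              # drop unvoiced frames
    return float(np.mean(f0)), float(np.min(f0)), float(np.max(f0))

for path in ["pre_op_speech.wav", "post_op_speech.wav"]:  # hypothetical files
    mean_f0, lo, hi = f0_stats(path)
    print(f"{path}: mean F0 = {mean_f0:.1f} Hz, range = {lo:.1f}-{hi:.1f} Hz")
```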

RESULTS: A total of 7 patients were included. The mean age was 35.4 years, and the mean postoperative follow-up time was 13.7 months. There was a mean increase of 47.9 ± 46.6 Hz (p = 0.023) in sustained /e/ F0 and a mean increase of 24.6 ± 27.5 Hz (p = 0.029) in speaking F0 after glottoplasty. No statistically significant pre- to postoperative differences were found in maximum phonation time, formant frequencies, frequency range, jitter, shimmer, or the grade, roughness, breathiness, asthenia, and strain scale. The Trans Woman Voice Questionnaire score decreased following surgery from 98.3 ± 9.2 to 54.1 ± 25.0 (p = 0.007), and mean self-perceived femininity of the voice increased from 2.8 ± 1.8 to 7.7 ± 2.4 (p = 0.008). One patient (14%) presented a postoperative granuloma, and there was one (14%) premature suture dehiscence.

CONCLUSION: Glottoplasty is safe and effective for feminizing the voice of transgender women. There was an increase in fundamental frequency, without aggravating other acoustic parameters or voice quality. Voice-related quality of life improved after surgery.}, } @article {pmid34396801, year = {2022}, author = {Chung, H}, title = {Acoustic Characteristics of Pre- and Post-vocalic /l/: Patterns from One Southern White Vernacular English.}, journal = {Language and speech}, volume = {65}, number = {2}, pages = {513-528}, doi = {10.1177/00238309211037368}, pmid = {34396801}, issn = {1756-6053}, mesh = {Acoustics ; Adult ; Female ; Humans ; Language ; Male ; *Phonetics ; *Speech Acoustics ; }, abstract = {This study examined acoustic characteristics of the phoneme /l/ produced by young female and male adult speakers of Southern White Vernacular English (SWVE) from Louisiana. F1, F2, and F2-F1 values extracted at the /l/ midpoint were analyzed by word position (pre- vs. post-vocalic) and vowel contexts (/i, ɪ/ vs. /ɔ, a/). Descriptive analysis showed that SWVE /l/ exhibited characteristics of the dark /l/ variant. The formant patterns of /l/, however, differed significantly by word position and vowel context, with pre-vocalic /l/ showing significantly higher F2-F1 values than post-vocalic /l/, and /l/ in the high front vowel context showing significantly higher F2-F1 values than those in the low back vowel context. Individual variation in the effects of word position and vowel contexts on /l/ pattern was also observed. Overall, the findings of the current study showed a gradient nature of SWVE /l/ variants whose F2-F1 patterns generally fell into the range of the dark /l/ variant, while varying by word position and vowel context.}, } @article {pmid34388438, year = {2021}, author = {Yang, L and Fu, K and Zhang, J and Shinozaki, T}, title = {Non-native acoustic modeling for mispronunciation verification based on language adversarial representation learning.}, journal = {Neural networks : the official journal of the International Neural Network Society}, volume = {142}, number = {}, pages = {597-607}, doi = {10.1016/j.neunet.2021.07.017}, pmid = {34388438}, issn = {1879-2782}, mesh = {Acoustics ; Humans ; *Language ; Language Development ; Speech ; *Speech Perception ; }, abstract = {Non-native mispronunciation verification is designed to provide feedback to guide language learners to correct their pronunciation errors in their further learning and it plays an important role in the computer-aided pronunciation training (CAPT) system. Most existing approaches focus on establishing the acoustic model directly using non-native corpus thus they are suffering the data sparsity problem due to time-consuming non-native speech data collection and annotation tasks. In this work, to address this problem, we propose a pre-trained approach to utilize the speech data of two native languages (the learner's native and target languages) for non-native mispronunciation verification. We set up an unsupervised model to extract knowledge from a large scale of unlabeled raw speech of the target language by making predictions about future observations in the speech signal, then the model is trained with language adversarial training using the learner's native language to align the feature distribution of two languages by confusing a language discriminator. In addition, sinc filter is incorporated at the first convolutional layer to capture the formant-like feature. 
Formants are relevant to the place and manner of articulation. They are therefore useful not only for pronunciation error detection but also for providing instructive feedback. The pre-trained model then serves as the feature extractor in the downstream mispronunciation verification task. In experiments on the Japanese part of the BLCU inter-Chinese speech corpus, the results demonstrate that, for the non-native phone recognition and mispronunciation verification tasks: (1) the knowledge learned from the speech of the two native languages with the proposed unsupervised approach is useful for both tasks; (2) the proposed language adversarial representation learning effectively improves performance; and (3) formant-like features can be incorporated by introducing a sinc filter, further improving mispronunciation verification performance.}, } @article {pmid34384662, year = {2024}, author = {Leyns, C and Corthals, P and Cosyns, M and Papeleu, T and Van Borsel, J and Morsomme, D and T'Sjoen, G and D'haeseleer, E}, title = {Acoustic and Perceptual Effects of Articulation Exercises in Transgender Women.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {1}, pages = {246.e15-246.e25}, doi = {10.1016/j.jvoice.2021.06.033}, pmid = {34384662}, issn = {1873-4588}, mesh = {Male ; Humans ; Female ; *Speech Acoustics ; *Transgender Persons ; Acoustics ; Speech ; Masculinity ; Phonetics ; }, abstract = {PURPOSE: This study measured the impact of articulation exercises using a cork and articulation exercises for lip spreading on the formant frequencies of vowels and listener perceptions of femininity in transgender women.

METHODS: Thirteen transgender women were recorded before and after the cork exercise and before and after the lip spreading exercise. Speech samples included continuous speech during reading and were analyzed using Praat software. Vowel formant frequencies (F1, F2, F3, F4, F5) and vowel space were determined. A listening experiment was organized in which naïve cisgender women and cisgender men rated audio samples of continuous speech. Masculinity/femininity, vocal quality, and age were rated using a visual analogue scale (VAS).
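
The formant measurements (F1-F5) described here can be sketched with parselmouth's Burg tracker; the file name and vowel midpoint times are placeholders, and the 5500 Hz ceiling is a commonly used setting for female voices rather than the study's documented choice:

```python
# Sketch of midpoint formant extraction (F1-F5) with the Burg algorithm.
import parselmouth

snd = parselmouth.Sound("reading_passage.wav")     # hypothetical recording
formant = snd.to_formant_burg(max_number_of_formants=5,
                              maximum_formant=5500.0)

vowel_midpoints = {"a": 1.23, "u": 2.07}           # placeholder times (s)
for vowel, t in vowel_midpoints.items():
    values = [formant.get_value_at_time(n, t) for n in range(1, 6)]
    print(vowel, [f"{v:.0f}" for v in values])     # F1..F5 in Hz
```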

RESULTS: Concerning vowel formant frequencies, F2 /a/ and F5 /u/ significantly increased after the lip spreading exercise, as well as F3 /a/, F3 /u/ and F4 /a/ after the cork exercise. The lip spreading exercise had more impact on the F2 /a/ than the cork exercise. Vowel space did not change after the exercises. The fundamental frequency (fo) increased simultaneously during both exercises. Both articulation exercises were associated with significantly increased listener perceptions of femininity of the voice.

CONCLUSION: Subtle changes in formant frequencies can be observed after performing articulation exercises, but not in every formant frequency or vowel. Cisgender listeners rated the speech of the transgender women as more feminine after the exercises. Further research with a more extensive therapy program and listening experiment is needed to examine these preliminary findings.}, } @article {pmid34344099, year = {2021}, author = {Yang, JJ and Cheng, LY and Xu, W}, title = {[Study on changes of voice characteristics after adenotonsillectomy or adenoidectomy in children].}, journal = {Zhonghua er bi yan hou tou jing wai ke za zhi = Chinese journal of otorhinolaryngology head and neck surgery}, volume = {56}, number = {7}, pages = {724-729}, doi = {10.3760/cma.j.cn115330-20200813-00672}, pmid = {34344099}, issn = {1673-0860}, mesh = {Adenoidectomy ; *Adenoids/surgery ; Child ; Child, Preschool ; Female ; Humans ; Male ; Speech Acoustics ; *Tonsillectomy ; Voice Quality ; }, abstract = {Objective: To study voice changes in children after adenotonsillectomy or adenoidectomy and their relationship with vocal tract structure. Methods: Fifty patients aged 4 to 12 years (median age 6) were prospectively recruited. They underwent adenotonsillectomy or adenoidectomy at Beijing Tongren Hospital, Capital Medical University, from July 2019 to August 2020. The cases comprised 31 males and 19 females. Thirty-six patients underwent adenotonsillectomy and 14 patients underwent adenoidectomy alone. Twenty-two children (13 males, 9 females) with grade Ⅰ bilateral tonsils, no adenoid hypertrophy, and no snoring were selected as normal controls. Adenoid and tonsil sizes were evaluated. Subjective changes of voice were recorded after surgery. Moreover, voice data including fundamental frequency (F0), jitter, shimmer, noise-to-harmonics ratio (NHR), maximum phonation time (MPT), formant frequencies (F1-F5), and bandwidths (B1-B5) of the vowels /a/ and /i/ were analyzed before surgery and at 3 days and 1 month after surgery, respectively. SPSS 23.0 was used for statistical analysis. Results: Thirty-six patients (72.0%, 36/50) complained of postoperative voice changes. The incidence was inversely correlated with age. In children aged 4-6, 7-9, and 10-12, the incidence was 83.3% (25/30), 63.6% (7/11), and 44.4% (4/9), respectively. Voice changes appeared more common in children who underwent adenotonsillectomy (77.8%, 28/36) than in those who underwent adenoidectomy alone (57.1%, 8/14), but the difference was not statistically significant. After the operation, for the vowel /a/, MPT (Z=2.18, P=0.041) and F2 (t=2.13, P=0.040) increased, while B2 (Z=2.04, P=0.041) and B4 (Z=2.00, P=0.046) decreased. For the vowel /i/, F2 (t=2.035, P=0.050) and F4 (t=4.44, P=0.0001) increased, and B2 (Z=2.36, P=0.019) decreased. Other acoustic parameters were not significantly different from those before surgery. The F2 (r=-0.392, P=0.032) of the vowel /a/ and the F2 (r=-0.279, P=0.048) and F4 (r=-0.401, P=0.028) of the vowel /i/ after adenotonsillectomy were significantly higher than after adenoidectomy alone. Half of the patients with postoperative voice changes recovered spontaneously 1 month after surgery. Conclusions: Voice changes in children who underwent adenotonsillectomy or adenoidectomy might be related to changes in their formants and bandwidths. The effect of adenotonsillectomy on voice was more significant compared with that of adenoidectomy alone.
The acoustic parameters did not change significantly after surgery except MPT.}, } @article {pmid34342877, year = {2021}, author = {Frey, R and Wyman, MT and Johnston, M and Schofield, M and Locatelli, Y and Reby, D}, title = {Roars, groans and moans: Anatomical correlates of vocal diversity in polygynous deer.}, journal = {Journal of anatomy}, volume = {239}, number = {6}, pages = {1336-1369}, pmid = {34342877}, issn = {1469-7580}, mesh = {Acoustics ; Animals ; *Deer ; Female ; *Larynx ; Male ; Vocal Cords ; Vocalization, Animal ; }, abstract = {Eurasian deer are characterized by the extraordinary diversity of their vocal repertoires. Male sexual calls range from roars with relatively low fundamental frequency (hereafter fo) in red deer Cervus elaphus, to moans with extremely high fo in sika deer Cervus nippon, and almost infrasonic groans with exceptionally low fo in fallow deer Dama dama. Moreover, while both red and fallow males are capable of lowering their formant frequencies during their calls, sika males appear to lack this ability. Female contact calls are also characterized by relatively less pronounced, yet strong interspecific differences. The aim of this study is to examine the anatomical bases of these inter-specific and inter-sexual differences by identifying if the acoustic variation is reflected in corresponding anatomical variation. To do this, we investigated the vocal anatomy of male and female specimens of each of these three species. Across species and sexes, we find that the observed acoustic variability is indeed related to expected corresponding anatomical differences, based on the source-filter theory of vocal production. At the source level, low fo is associated with larger vocal folds, whereas high fo is associated with smaller vocal folds: sika deer have the smallest vocal folds and male fallow deer the largest. Red and sika deer vocal folds do not appear to be sexually dimorphic, while fallow deer exhibit strong sexual dimorphism (after correcting for body size differences). At the filter level, the variability in formants is related to the configuration of the vocal tract: in fallow and red deer, both sexes have evolved a permanently descended larynx (with a resting position of the larynx much lower in males than in females). Both sexes also have the potential for momentary, call-synchronous vocal tract elongation, again more pronounced in males than in females. In contrast, the resting position of the larynx is high in both sexes of sika deer and the potential for further active vocal tract elongation is virtually absent in both sexes. Anatomical evidence suggests an evolutionary reversal in larynx position within sika deer, that is, a secondary larynx ascent. Together, our observations confirm that the observed diversity of vocal behaviour in polygynous deer is supported by strong anatomical differences, highlighting the importance of anatomical specializations in shaping mammalian vocal repertoires. Sexual selection is discussed as a potential evolutionary driver of the observed vocal diversity and sexual dimorphisms.}, } @article {pmid34340503, year = {2021}, author = {Strycharczuk, P and Ćavar, M and Coretta, S}, title = {Distance vs time. 
Acoustic and articulatory consequences of reduced vowel duration in Polish.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {1}, pages = {592}, doi = {10.1121/10.0005585}, pmid = {34340503}, issn = {1520-8524}, mesh = {Acoustics ; Language ; *Phonetics ; Poland ; Speech ; *Speech Acoustics ; }, abstract = {This paper presents acoustic and articulatory (ultrasound) data on vowel reduction in Polish. The analysis focuses on the question of whether the change in formant value in unstressed vowels can be explained by duration-driven undershoot alone or whether there is also evidence for additional stress-specific articulatory mechanisms that systematically affect vowel formants. On top of the expected durational differences between the stressed and unstressed conditions, the duration is manipulated by inducing changes in the speech rate. The observed vowel formants are compared to expected formants derived from the articulatory midsagittal tongue data in different conditions. The results show that the acoustic vowel space is reduced in size and raised in unstressed vowels compared to stressed vowels. Most of the spectral reduction can be explained by reduced vowel duration, but there is also an additional systematic effect of F1-lowering in unstressed non-high vowels that does not follow from tongue movement. The proposed interpretation is that spectral vowel reduction in Polish behaves largely as predicted by the undershoot model of vowel reduction, but the effect of undershoot is enhanced for low unstressed vowels, potentially by a stress marking strategy which involves raising the fundamental frequency.}, } @article {pmid34340486, year = {2021}, author = {Petersen, EA and Colinot, T and Silva, F and H-Turcotte, V}, title = {The bassoon tonehole lattice: Links between the open and closed holes and the radiated sound spectrum.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {1}, pages = {398}, doi = {10.1121/10.0005627}, pmid = {34340486}, issn = {1520-8524}, abstract = {The acoustics of the bassoon has been the subject of relatively few studies compared with other woodwind instruments. One reason for this may lie in its complicated resonator geometry, which includes irregularly spaced toneholes with chimney heights ranging from 3 to 31 mm. The current article evaluates the effect of the open and closed tonehole lattice (THL) on the acoustic response of the bassoon resonator. It is shown that this response can be divided into three distinct frequency bands that are determined by the open and closed THL: below 500 Hz, 500-2200 Hz, and above 2200 Hz. The first is caused by the stopband of the open THL, where the low frequency effective length of the instrument is determined by the location of the first open tonehole. The second is due to the passband of the open THL, such that the modes are proportional to the total length of the resonator. The third is due to the closed THL, where part of the acoustical power is trapped within the resonator. 
It is proposed that these three frequency bands impact the radiated spectrum by introducing a formant in the vicinity of 500 Hz and suppressing radiation above 2200 Hz for most first register fingerings.}, } @article {pmid34340472, year = {2021}, author = {Uezu, Y and Hiroya, S and Mochida, T}, title = {Articulatory compensation for low-pass filtered formant-altered auditory feedback.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {1}, pages = {64}, doi = {10.1121/10.0004775}, pmid = {34340472}, issn = {1520-8524}, mesh = {Feedback ; Feedback, Sensory ; Speech ; Speech Acoustics ; *Speech Perception ; Speech Production Measurement ; }, abstract = {Auditory feedback while speaking plays an important role in stably controlling speech articulation. Its importance has been verified in formant-altered auditory feedback (AAF) experiments where speakers utter while listening to speech with perturbed first (F1) and second (F2) formant frequencies. However, the contribution of the frequency components higher than F2 to the articulatory control under the perturbations of F1 and F2 has not yet been investigated. In this study, a formant-AAF experiment was conducted in which a low-pass filter was applied to speech. The experimental results showed that the deviation in the compensatory response was significantly larger when a low-pass filter with a cutoff frequency of 3 kHz was used compared to that when cutoff frequencies of 4 and 8 kHz were used. It was also found that the deviation in the 3-kHz condition correlated with the fundamental frequency and spectral tilt of the produced speech. Additional simulation results using a neurocomputational model of speech production (SimpleDIVA model) and the experimental data showed that the feedforward learning rate increased as the cutoff frequency decreased. These results suggest that high-frequency components of the auditory feedback would be involved in the determination of corrective motor commands from auditory errors.}, } @article {pmid34291230, year = {2021}, author = {Lynn, E and Narayanan, SS and Lammert, AC}, title = {Dark tone quality and vocal tract shaping in soprano song production: Insights from real-time MRI.}, journal = {JASA express letters}, volume = {1}, number = {7}, pages = {075202}, pmid = {34291230}, issn = {2691-1191}, abstract = {Tone quality termed "dark" is an aesthetically important property of Western classical voice performance and has been associated with lowered formant frequencies, lowered larynx, and widened pharynx. The present study uses real-time magnetic resonance imaging with synchronous audio recordings to investigate dark tone quality in four professionally trained sopranos with enhanced ecological validity and a relatively complete view of the vocal tract. 
Findings differ from traditional accounts, indicating that labial narrowing may be the primary driver of dark tone quality across performers, while many other aspects of vocal tract shaping are shown to differ significantly in a performer-specific way.}, } @article {pmid34265989, year = {2021}, author = {Liu, R and Wang, G and Deng, D and Zhang, T}, title = {Spin Hall effect of Laguerre-Gaussian beams in PT symmetric metamaterials.}, journal = {Optics express}, volume = {29}, number = {14}, pages = {22192-22201}, doi = {10.1364/OE.427869}, pmid = {34265989}, issn = {1094-4087}, abstract = {The spin Hall effect (SHE) of Laguerre-Gaussian (LG) beams reflected and transmitted in parity-time (PT) symmetric metamaterials is investigated near the coherent-perfect-absorption (CPA)-laser point and exceptional points (EPs). The numerical results show that large transverse shifts occur at the CPA-laser point regardless of the incident direction. But at EPs, the SHE increases at one side and disappears at the other side, thus achieving an intense SHE of the reflected light beams at incidence on the specified side. In addition, it is found that Bragg oscillation can be generated by increasing the period number of PT symmetric metamaterial layers, thus increasing the number of formants in the transverse displacement. In particular, the transverse shift peaks of the transmitted beams merge into a positive peak when the incident angle is close to 90° and essentially do not change with increasing Im(ɛ), which can also be considered a strong tolerance to variation of Im(ɛ). This feature is expected to enable a new type of optoelectronic device with anti-interference performance. These results provide a feasible path for the modulation of the spin Hall effect of light (SHEL) and provide the possibility for the development of new nanophotonic devices.

METHODS: Nineteen adults (ten females, nine males) with a normal voice quality completed sustained vowel tasks. All tasks were performed for each of the six mask conditions: no mask, cloth mask, surgical mask, KN95 mask, and a surgical mask over a KN95 mask with and without a face shield. Intensity measurements were obtained at 1 ft and 6 ft distances from the speaker with sound level meters. Tasks were recorded with a 1 ft mouth-to-microphone distance. Acoustic variables of interest were fundamental frequency (F0) and formant frequencies (F1, F2) for /a/ and /i/, and smoothed cepstral peak prominence (CPPs) for /a/.
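
Cepstral peak prominence is, in essence, the height of the cepstral peak above a regression line fit to the cepstrum; the sketch below computes a simplified, unsmoothed CPP (Praat's CPPs adds time and quefrency smoothing omitted here, and the file name is a placeholder):

```python
# Simplified, unsmoothed cepstral-peak-prominence sketch (closer to CPP than
# to Praat's smoothed CPPs): cepstral peak height above a regression line.
import numpy as np
from scipy.io import wavfile

rate, x = wavfile.read("sustained_a.wav")          # hypothetical mono recording
x = x.astype(np.float64)
x /= np.max(np.abs(x)) + 1e-12

log_spec = np.log(np.abs(np.fft.fft(x * np.hanning(len(x)))) + 1e-12)
cepstrum = np.real(np.fft.ifft(log_spec))

q = np.arange(len(cepstrum)) / rate                # quefrency in seconds
lo, hi = int(rate / 330), int(rate / 60)           # F0 search range 60-330 Hz
peak = lo + int(np.argmax(cepstrum[lo:hi]))

slope, intercept = np.polyfit(q[lo:hi], cepstrum[lo:hi], 1)
cpp = cepstrum[peak] - (slope * q[peak] + intercept)
print(f"cepstral peak near {1.0 / q[peak]:.0f} Hz, CPP = {cpp:.3f}")
```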

RESULTS: Data were analyzed to compare differences between sexes and mask types. Statistically significant differences between males and females were found for intensity measures and all acoustic variables except F2 for /a/ and F1 for /i/. Few pairwise comparisons between masks reached significance, even though main effects for mask type were observed. These are further discussed in the article.

CONCLUSION: The masks tested in this study did not have a significant impact on intensity, fundamental frequency, CPPs, first or second formant frequency compared to voice output without a mask. Use of a face shield seemed to affect intensity and CPPs to some extent. Implications of these findings are discussed further in the article.}, } @article {pmid34260437, year = {2022}, author = {Easwar, V and Birstler, J and Harrison, A and Scollie, S and Purcell, D}, title = {The Influence of Sensation Level on Speech-Evoked Envelope Following Responses.}, journal = {Ear and hearing}, volume = {43}, number = {1}, pages = {250-254}, pmid = {34260437}, issn = {1538-4667}, support = {UL1 TR002373/TR/NCATS NIH HHS/United States ; //CIHR/Canada ; }, mesh = {Acoustic Stimulation ; Adult ; Female ; Humans ; Male ; Phonetics ; Sensation ; *Speech ; *Speech Perception/physiology ; }, abstract = {OBJECTIVES: To evaluate sensation level (SL)-dependent characteristics of envelope following responses (EFRs) elicited by band-limited speech dominant in low, mid, and high frequencies.

DESIGN: In 21 young normal-hearing adults, EFRs were elicited by eight male-spoken speech stimuli: the first formant, and the second and higher formants, of /u/, /a/, and /i/, and the modulated fricatives /∫/ and /s/. Stimulus SL was computed from behaviorally measured thresholds.

RESULTS: At 30 dB SL, the amplitude and phase coherence of fricative-elicited EFRs were ~1.5 to 2 times higher than those of all vowel-elicited EFRs, whereas fewer and smaller differences were found among the vowel-elicited EFRs. For all stimuli, EFR amplitude and phase coherence increased by roughly 50% for every 10 dB increase in SL between ~0 and 50 dB.
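
The phase coherence referred to here is typically quantified as inter-trial phase coherence, the resultant length of the per-trial spectral phase at the response frequency; a self-contained sketch on synthetic epochs, where the sampling rate, response frequency, and trial count are all assumed values:

```python
# Sketch of inter-trial phase coherence (ITPC) and amplitude at an assumed
# EFR frequency, using synthetic EEG epochs as placeholder data.
import numpy as np

rate, f_efr, n_trials = 8000, 100.0, 200          # assumed parameters
rng = np.random.default_rng(0)

t = np.arange(rate) / rate                        # 1-second epochs
trials = 0.1 * np.sin(2 * np.pi * f_efr * t) + rng.normal(size=(n_trials, t.size))

spectra = np.fft.rfft(trials, axis=1)
freqs = np.fft.rfftfreq(t.size, d=1.0 / rate)
k = int(np.argmin(np.abs(freqs - f_efr)))         # bin at the EFR frequency

itpc = np.abs(np.mean(np.exp(1j * np.angle(spectra[:, k]))))
amp = 2.0 * np.mean(np.abs(spectra[:, k])) / t.size
print(f"ITPC = {itpc:.2f} (0 = random, 1 = locked), amplitude = {amp:.3f}")
```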

CONCLUSIONS: Stimulus and frequency dependencies in EFRs persist even after accounting for differences in the audibility of speech sounds. The growth rate of EFR characteristics with SL is independent of the stimulus and its frequency.}, } @article {pmid34256982, year = {2023}, author = {Zealouk, O and Satori, H and Hamidi, M and Laaidi, N and Salek, A and Satori, K}, title = {Analysis of COVID-19 Resulting Cough Using Formants and Automatic Speech Recognition System.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {6}, pages = {971.e9-971.e16}, pmid = {34256982}, issn = {1873-4588}, mesh = {Humans ; *Speech Recognition Software ; Cough/diagnosis/etiology ; Pandemics ; *COVID-19/complications/diagnosis ; Speech ; }, abstract = {As part of our contribution to research on the ongoing worldwide COVID-19 pandemic, we studied cough changes in infected people based on Hidden Markov Model (HMM) speech recognition classification, formant frequency, and pitch analysis. In this paper, an HMM-based cough recognition system was implemented with 5 HMM states, 8 Gaussian mixture distributions (GMMs), and 13 basic Mel-Frequency Cepstral Coefficients (MFCCs), giving an overall feature vector of 39 dimensions. Formant frequency and pitch values extracted from the coughs of COVID-19-infected and healthy people were compared to corroborate the results of our cough recognition system. The experimental results show that the difference between the recognition rates of infected and non-infected people is 6.7%. Formant variation between the coughs of infected and non-infected people is clearly observed for F1, F3, and F4, and is smaller for F0 and F2.}, } @article {pmid34251887, year = {2021}, author = {Easwar, V and Scollie, S and Lasarev, M and Urichuk, M and Aiken, SJ and Purcell, DW}, title = {Characteristics of Speech-Evoked Envelope Following Responses in Infancy.}, journal = {Trends in hearing}, volume = {25}, number = {}, pages = {23312165211004331}, pmid = {34251887}, issn = {2331-2165}, support = {UL1 TR002373/TR/NCATS NIH HHS/United States ; }, mesh = {Acoustic Stimulation ; Hearing Tests ; Humans ; Infant ; Phonetics ; *Speech ; *Speech Perception ; Young Adult ; }, abstract = {Envelope following responses (EFRs) may be a useful tool for evaluating the audibility of speech sounds in infants. The present study aimed to evaluate the characteristics of speech-evoked EFRs in infants with normal hearing, relative to adults, and identify age-dependent changes in EFR characteristics during infancy. In 42 infants and 21 young adults, EFRs were elicited by the first (F1) and the second and higher formants (F2+) of the vowels /u/, /a/, and /i/, dominant in low and mid frequencies, respectively, and by amplitude-modulated fricatives /s/ and /∫/, dominant in high frequencies. In a subset of 20 infants, the in-ear stimulus level was adjusted to match that of an average adult ear (65 dB sound pressure level [SPL]).
We found that (a) adult-infant differences in EFR amplitude, signal-to-noise ratio, and intertrial phase coherence were larger and spread across the frequency range when in-ear stimulus level was adjusted in infants, (b) adult-infant differences in EFR characteristics were the largest for low-frequency stimuli, (c) infants demonstrated adult-like phase coherence when they received a higher (i.e., unadjusted) stimulus level, and (d) EFR phase coherence and signal-to-noise ratio changed with age in the first year of life for a few F2+ vowel stimuli in a level-specific manner. Together, our findings reveal that development-related changes in EFRs during infancy likely vary by stimulus frequency, with low-frequency stimuli demonstrating the largest adult-infant differences. Consistent with previous research, our findings emphasize the significant role of stimulus level calibration methods while investigating developmental trends in EFRs.}, } @article {pmid34241428, year = {2021}, author = {Echternach, M and Herbst, CT and Köberlein, M and Story, B and Döllinger, M and Gellrich, D}, title = {Are source-filter interactions detectable in classical singing during vowel glides?.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {6}, pages = {4565}, doi = {10.1121/10.0005432}, pmid = {34241428}, issn = {1520-8524}, mesh = {Female ; Humans ; Male ; Occupations ; Phonation ; *Singing ; *Voice ; Voice Quality ; }, abstract = {In recent studies, it has been assumed that vocal tract formants (Fn) and the voice source could interact. However, there are only a few studies analyzing this assumption in vivo. Here, the vowel transitions /i/-/a/-/u/-/i/ of 12 professional classical singers (6 females, 6 males) when phonating on the pitch D4 [fundamental frequency (ƒo) ca. 294 Hz] were analyzed using transnasal high-speed videoendoscopy (20,000 fps), electroglottography (EGG), and audio recordings. Fn data were calculated using a cepstral method. Source-filter interaction candidates (SFICs) were determined by (a) algorithmic detection of major intersections of Fn/nƒo and (b) perceptual assessment of the EGG signal. Although the open quotient showed some increase for the /i-a/ and /u-i/ transitions, there were no clear effects at the expected Fn/nƒo intersections. In contrast, ƒo adjustments and changes in the phonovibrogram occurred at perceptually derived SFICs, suggesting level-two interactions. In some cases, these were constituted by intersections between higher nƒo and Fn. The presented data partially corroborate that vowel transitions may result in level-two interactions also in professional singers. However, the lack of systematically detectable effects suggests either the absence of a strong interaction or the existence of confounding factors, which may potentially counterbalance the level-two interactions.}, } @article {pmid34241427, year = {2021}, author = {Zhang, C and Jepson, K and Lohfink, G and Arvaniti, A}, title = {Comparing acoustic analyses of speech data collected remotely.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {6}, pages = {3910}, pmid = {34241427}, issn = {1520-8524}, mesh = {Acoustics ; *COVID-19 ; Humans ; Phonetics ; SARS-CoV-2 ; *Speech ; Speech Acoustics ; }, abstract = {Face-to-face speech data collection has been next to impossible globally as a result of the COVID-19 restrictions.
To address this problem, simultaneous recordings of three repetitions of the cardinal vowels were made using a Zoom H6 Handy Recorder with an external microphone (henceforth, H6) and compared with two alternatives accessible to potential participants at home: the Zoom meeting application (henceforth, Zoom) and two lossless mobile phone applications (Awesome Voice Recorder, and Recorder; henceforth, Phone). F0 was tracked accurately by all of the devices; however, for formant analysis (F1, F2, F3), Phone performed better than Zoom, i.e., more similarly to H6, although the data extraction method (VoiceSauce, Praat) also resulted in differences. In addition, Zoom recordings exhibited unexpected drops in intensity. The results suggest that lossless format phone recordings present a viable option for at least some phonetic studies.}, } @article {pmid34240071, year = {2021}, author = {Diamant, N and Amir, O}, title = {Examining the voice of Israeli transgender women: Acoustic measures, voice femininity and voice-related quality-of-life.}, journal = {International journal of transgender health}, volume = {22}, number = {3}, pages = {281-293}, pmid = {34240071}, issn = {2689-5277}, abstract = {BACKGROUND: Transgender women may experience gender-dysphoria associated with their voice and the way it is perceived. Previous studies have shown that specific acoustic measures are associated with the perception of voice-femininity and with voice-related quality-of-life, yet results are inconsistent.

AIMS: This study aimed to examine the associations between specific voice measures of transgender women, voice-related quality-of-life, and the perception of voice-femininity by listeners and by the speakers themselves.

METHODS: Thirty Hebrew-speaking transgender women were recorded. They also rated their voice-femininity and completed the Hebrew version of the TVQ[MtF] questionnaire. Recordings were analyzed to extract mean fundamental frequency (F0), formant frequencies (F1, F2, F3), and vocal-range (calculated in Hz and in semitones). Recordings were also rated on a 7-point voice-gender scale by 20 naïve cisgender listeners.

RESULTS: Significant correlations were found between both F0 and F1 and listeners' as well as speakers' evaluations of voice-femininity. TVQ[MtF] scores were significantly correlated with F0 and with the lower and upper boundaries of the vocal-range. Voice-femininity ratings were strongly correlated with vocal-range when calculated in Hz, but not when defined in semitones. Listeners' evaluation and speakers' self-evaluation of voice-femininity were significantly correlated. However, TVQ[MtF] scores were significantly correlated only with the speakers' voice-femininity ratings, but not with those of the listeners.

CONCLUSION: Higher F0 and F1, which are perceived as more feminine, jointly improved speakers' satisfaction with their voice. Speakers' self-evaluation of voice-femininity does not mirror listeners' judgment, as it is affected by additional factors related to self-satisfaction and personal experience. Combining listeners' and speakers' voice evaluations with acoustic analysis is valuable, as it provides a more holistic view of how transgender women feel about their voice and how it is perceived by listeners.}, } @article {pmid34232704, year = {2021}, author = {Leung, Y and Oates, J and Chan, SP and Papp, V}, title = {Associations Between Speaking Fundamental Frequency, Vowel Formant Frequencies, and Listener Perceptions of Speaker Gender and Vocal Femininity-Masculinity.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {7}, pages = {2600-2622}, doi = {10.1044/2021_JSLHR-20-00747}, pmid = {34232704}, issn = {1558-9102}, mesh = {Australia ; Female ; *Femininity ; Humans ; Male ; *Masculinity ; Perception ; Speech Acoustics ; }, abstract = {Purpose The aim of the study was to examine associations between speaking fundamental frequency (fos), vowel formant frequencies (F), listener perceptions of speaker gender, and vocal femininity-masculinity. Method An exploratory study was undertaken to examine associations between fos, F1-F3, listener perceptions of speaker gender (nominal scale), and vocal femininity-masculinity (visual analog scale). For 379 speakers of Australian English aged 18-60 years, fos mode and F1-F3 (12 monophthongs; total of 36 Fs) were analyzed on a standard reading passage. Seventeen listeners rated speaker gender and vocal femininity-masculinity on randomized audio recordings of these speakers. Results Model building using principal component analysis suggested the 36 Fs could be succinctly reduced to seven principal components (PCs). Generalized structural equation modeling (with the seven PCs of F and fos as predictors) suggested that only F2 and fos predicted listener perceptions of speaker gender (male, female, unable to decide). However, listener perceptions of vocal femininity-masculinity behaved differently and were predicted by F1, F3, and the contrast between monophthongs at the extremities of the F1 acoustic vowel space, in addition to F2 and fos. Furthermore, listeners' perceptions of speaker gender also influenced ratings of vocal femininity-masculinity substantially. Conclusion Adjusted odds ratios highlighted the substantially larger contribution of F to listener perceptions of speaker gender and vocal femininity-masculinity relative to fos than has previously been reported.}, } @article {pmid34229221, year = {2021}, author = {Easwar, V and Boothalingam, S and Flaherty, R}, title = {Fundamental frequency-dependent changes in vowel-evoked envelope following responses.}, journal = {Hearing research}, volume = {408}, number = {}, pages = {108297}, doi = {10.1016/j.heares.2021.108297}, pmid = {34229221}, issn = {1878-5891}, mesh = {Acoustic Stimulation ; Hearing Tests ; Humans ; Male ; Speech ; *Speech Perception ; *Voice ; Young Adult ; }, abstract = {Scalp-recorded envelope following responses (EFRs) provide a non-invasive method to assess the encoding of the fundamental frequency (f0) of voice that is important for speech understanding. It is well-known that EFRs are influenced by voice f0.
However, this effect of f0 has not been examined independently of concomitant changes in spectra or neural generators. We evaluated the effect of voice f0 on EFRs while controlling for vowel formant characteristics and potentially avoiding significant changes in dominant neural generators using a small f0 range. EFRs were elicited by a male-spoken vowel /u/ (average f0 = 100.4 Hz) and its lowered f0 version (average f0 = 91.9 Hz) with closely matched formant characteristics. Vowels were presented to each ear of 17 young adults with normal hearing. EFRs were simultaneously recorded between the vertex and the nape, and the vertex and the ipsilateral mastoid-the two most common electrode montages used for EFRs. Our results indicate that when vowel formant characteristics are matched, an increase in f0 by 8.5 Hz reduces EFR amplitude by 25 nV, phase coherence by 0.05 and signal-to-noise ratio by 3.5 dB, on average. The reduction in EFR characteristics was similar across ears of stimulation and the two montages used. These findings will help parse the influence of f0 or stimulus spectra on EFRs when both co-vary.}, } @article {pmid34213387, year = {2022}, author = {Eravci, FC and Yildiz, BD and Özcan, KM and Moran, M and Çolak, M and Karakurt, SE and Karakuş, MF and Ikinciogullari, A}, title = {Acoustic parameter changes after bariatric surgery.}, journal = {Logopedics, phoniatrics, vocology}, volume = {47}, number = {4}, pages = {256-261}, doi = {10.1080/14015439.2021.1945676}, pmid = {34213387}, issn = {1651-2022}, mesh = {Humans ; Adult ; Middle Aged ; *Speech Acoustics ; Voice Quality ; Prospective Studies ; Longitudinal Studies ; Acoustics ; *Bariatric Surgery/adverse effects ; Weight Loss ; }, abstract = {OBJECTIVE: To investigate the acoustic parameter changes after weight loss in bariatric surgery patients.

MATERIALS AND METHODS: This prospective, longitudinal study was conducted with 15 patients with planned bariatric surgery, who were evaluated pre-operatively and at 6 months post-operatively. Fundamental frequency (F0), formant frequencies (F1, F2, F3, and F4), frequency perturbation (jitter), amplitude perturbation (shimmer) and noise-to-harmonics ratio (NHR) parameters were evaluated for the vowels /a/, /e/, /i/, /o/, and /u/. Changes in the acoustic analysis parameters for each vowel were compared. The participants were separated into two groups according to whether the Mallampati score had not changed (Group 1) or had decreased (Group 2), and changes in the formant frequencies were compared between these groups.

RESULTS: A total of 15 patients with a median age of 40 ± 11 years completed the study. The median weight of the patients was 122 ± 14 kg pre-operatively and 80 ± 15 kg post-operatively. BMI declined from 46 ± 4 to 31 ± 5 kg/m². The Mallampati score decreased by one point in six patients and remained stable in nine. Of the acoustic voice analysis parameters of the vowels, fundamental frequency generally tended to decrease, while shimmer and jitter values tended to increase. Some of the formant frequencies were specifically affected by the weight loss, and these changes differed significantly between Group 1 and Group 2.

CONCLUSION: The present study reveals that some specific voice characteristics might be affected by successful weight loss after bariatric surgery. Highlights: Obesity reduces the size of the pharyngeal lumen at different levels. The supralaryngeal vocal tract size and configuration is a determinative factor in the features of the voice. Changes in the length and shape of the vocal tract, or the height and position of the tongue, can result in changes, especially in formant frequencies, in acoustic analysis.}, } @article {pmid34160929, year = {2021}, author = {Yang, J}, title = {Vowel development in young Mandarin-English bilingual children.}, journal = {Phonetica}, volume = {78}, number = {3}, pages = {241-272}, doi = {10.1515/phon-2021-2006}, pmid = {34160929}, issn = {1423-0321}, mesh = {Child ; Child, Preschool ; Humans ; Language ; Language Development ; *Multilingualism ; Phonetics ; *Speech Perception ; }, abstract = {This study examined the development of vowel categories in young Mandarin-English bilingual children. The participants included 35 children aged between 3 and 4 years (15 Mandarin-English bilinguals, 6 English monolinguals, and 14 Mandarin monolinguals). The bilingual children were divided into two groups: one group had a shorter duration (<1 year) of intensive immersion in English (Bi-low group) and one group had a longer duration (>1 year) of intensive immersion in English (Bi-high group). The participants were recorded producing one list of Mandarin words containing the vowels /a, i, u, y, ɤ/ and/or one list of English words containing the vowels /i, ɪ, e, ɛ, æ, u, ʊ, o, ɑ, ʌ/. Formant frequency values were extracted at five equidistant time locations (the 20-35-50-65-80% points) over the course of vowel duration. Cross-language and within-language comparisons were conducted on the midpoint formant values and formant trajectories. The results showed that children in the Bi-low group produced their English vowels in clusters and showed positional deviations from the monolingual targets. However, they maintained the phonetic features of their native vowel sounds well and mainly used an assimilatory process to organize the vowel systems. Children in the Bi-high group separated their English vowels well. They used both assimilatory and dissimilatory processes to construct and refine the two vowel systems. These bilingual children approximated monolingual English children to a better extent than the children in the Bi-low group. However, when compared to the monolingual peers, they demonstrated observable deviations in both L1 and L2.}, } @article {pmid34116888, year = {2023}, author = {Lin, Y and Cheng, L and Wang, Q and Xu, W}, title = {Effects of Medical Masks on Voice Assessment During the COVID-19 Pandemic.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {5}, pages = {802.e25-802.e29}, doi = {10.1016/j.jvoice.2021.04.028}, pmid = {34116888}, issn = {1873-4588}, mesh = {Male ; Female ; Humans ; Middle Aged ; Phonation ; Speech Acoustics ; Masks/adverse effects ; Pandemics/prevention & control ; *COVID-19/prevention & control ; *Voice ; }, abstract = {OBJECTIVE: Voice assessment is of great significance to the evaluation of voice quality. Our study aims to explore the effects of medical masks on the acoustic, aerodynamic and formant parameters of healthy people during the COVID-19 pandemic. In addition, we also attempted to verify the differences between sexes and ages.

METHODS: Fifty-three healthy participants (25 males and 28 females) were involved in our study. The acoustic parameters, including fundamental frequency (F0), sound pressure level (SPL), jitter (%), shimmer (%), noise-to-harmonic ratio (NHR) and cepstral peak prominence (CPP), the aerodynamic parameter (maximum phonation time, MPT) and the formant parameters (F1, F2, F3), were measured without and with medical masks. We further investigated the potential differences in the impact on different sexes and ages (≤45 years old and >45 years old).

RESULTS: While wearing medical masks, the SPL significantly increased (71.22±4.25 dB without vs. 72.42±3.96 dB with, P = 0.021). Jitter and shimmer significantly decreased (jitter 1.19±0.83 vs. 0.87±0.67, P = 0.005; shimmer 4.49±2.20 vs. 3.66±2.02, P = 0.002), as did F3 (2855±323.34 Hz vs. 2781.89±353.42 Hz, P = 0.004). F0, MPT, F1 and F2 showed increasing trends without statistical significance, and NHR as well as CPP showed little change without and with medical masks. There were no significant differences between males and females. Regarding age, a significant difference in MPT was seen (>45 years old: 16.15±6.98 s vs. 15.38±7.02 s; ≤45 years old: 20.26±6.47 s vs. 21.44±6.98 s, P = 0.032).

CONCLUSION: Healthy participants showed a significantly higher SPL, a smaller perturbation and an evident decrease in F3 while wearing medical masks. These changes may result from the adjustment of the vocal tract and the filtration function of medical masks, meaning that the stability of the voices we recorded may be overstated. The impact of medical masks did not differ evidently between sexes, while the MPT in the >45-year-old group was influenced more than that in the ≤45-year-old group.}, } @article {pmid34091212, year = {2021}, author = {Madrid, AM and Walker, KA and Smith, SB and Hood, LJ and Prieve, BA}, title = {Relationships between click auditory brainstem response and speech frequency following response with development in infants born preterm.}, journal = {Hearing research}, volume = {407}, number = {}, pages = {108277}, doi = {10.1016/j.heares.2021.108277}, pmid = {34091212}, issn = {1878-5891}, support = {R01 DC011777/DC/NIDCD NIH HHS/United States ; }, mesh = {Child, Preschool ; *Evoked Potentials, Auditory, Brain Stem ; Gestational Age ; Humans ; Infant ; Infant, Newborn ; Infant, Premature ; Speech ; *Speech Perception ; }, abstract = {The speech evoked frequency following response (sFFR) is used to study relationships between neural processing and functional aspects of speech and language that are not captured by click or toneburst evoked auditory brainstem responses (ABR). The sFFR is delayed, deviant, or weak in school-age children with a variety of disorders, including autism, dyslexia, and reading and language disorders, relative to their typically developing peers. Much less is known about the developmental characteristics of the sFFR, especially in preterm infants, who are at risk of language delays. In term neonates, phase locking and spectral representation of the fundamental frequency are developed in the early days of life. Spectral representation of higher harmonics and latencies associated with transient portions of the stimulus are still developing in term infants through at least 10 months of age. The goal of this research was to determine whether sFFR could be measured in preterm infants and to characterize its developmental trajectory in the time and frequency domain. Click ABR and sFFR were measured in 28 preterm infants at ages 33 to 64 weeks gestational age. The sFFR could be measured in the majority of infants at 33 weeks gestational age, and the detectability of all sFFR waves was 100% by 64 weeks gestational age. The latency of all waves associated with the transient portion of the response (waves V, A, and O), and most waves (waves D and E) associated with the quasi-steady state decreased with increasing age. The interpeak wave A-O latency did not change with age, indicating that these waves share a neural generator, or the neural generators are developing at the same rate. The spectral amplitude of F0 and the lower frequencies of the first formant increased with age, but that for higher frequencies of the first formant and higher harmonics did not. The results suggest that the sFFR can be reliably recorded in preterm infants, including those cared for in the neonatal intensive care unit. These findings support the view that in preterm infants, F0 amplitude continues to develop within the first 6 months of life and develops before efficient representation of higher-frequency harmonics.
Further research is needed to determine if the sFFR in preterm infants is predictive of long-term language or learning disorders.}, } @article {pmid34045154, year = {2023}, author = {Andrade, PA and Frič, M and Otčenášek, Z}, title = {Assessment of Changes in Laryngeal Configuration and Voice Parameters Among Different Frequencies of Neuromuscular Electrical Stimulation (NMES) and Cumulative Effects of NMES in a Normophonic Subject: A Pilot Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {4}, pages = {632.e1-632.e20}, doi = {10.1016/j.jvoice.2021.03.018}, pmid = {34045154}, issn = {1873-4588}, mesh = {Humans ; Pilot Projects ; *Voice/physiology ; Vocal Cords/physiology ; Laryngeal Muscles/physiology ; Electric Stimulation ; }, abstract = {INTRODUCTION: Neuromuscular electrical stimulation (NMES) is a complementary resource to voice therapy that can be used for the treatment of hypofunctional voice disorders. Although positive clinical studies have been reported, neutral and even potentially harmful effects of NMES are also described in the literature. Furthermore, in the studies examined by the authors, the use of different methods of NMES has been identified, which further contributes to the inconsistent results found among studies. Moreover, limited rationale is provided for the chosen NMES parameters such as electrode placement, frequency of NMES and length of treatment. The aims of this pilot study were to investigate (a) the impact of different frequencies of NMES on glottal configuration and vocal fold vibration patterns and (b) the changes in laryngeal configuration and vocal output across 12 minutes of NMES.

METHOD: Three experiments were carried out looking at changes in laryngeal configuration and voice output using different imaging techniques (fibreoptic nasolaryngoscopy and high-speed video), acoustical analysis (F0, formant analysis, SPL, CPPS and LHSR values), electroglottography (EGG) and Relative Fundamental Frequency (RFF) analyses. Glottal parameters and acoustical measures were recorded before, during, and after stimulation. Data was collected at rest and during phonation.

RESULTS: Overall, the results showed global changes in laryngeal configuration from normal to hyperfunctional (ie, increased RFF, SPL, CQ, and stiffness). Changes were more pronounced for lower frequencies of NMES and were significant within less than three minutes of application.

CONCLUSION: NMES is an effective resource for the activation of intrinsic laryngeal muscles, producing significant levels of adduction within a few minutes of application. Lower NMES frequencies produced greater muscle activation when compared to higher frequencies.}, } @article {pmid34043445, year = {2021}, author = {Daliri, A}, title = {A Computational Model for Estimating the Speech Motor System's Sensitivity to Auditory Prediction Errors.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6}, pages = {1841-1854}, pmid = {34043445}, issn = {1558-9102}, support = {R21 DC017563/DC/NIDCD NIH HHS/United States ; }, mesh = {Adaptation, Physiological ; Feedback, Sensory ; Female ; Humans ; Sound ; *Speech ; *Speech Perception ; }, abstract = {Purpose The speech motor system uses feedforward and feedback control mechanisms that are both reliant on prediction errors. Here, we developed a state-space model to estimate the error sensitivity of the control systems. We examined (a) whether the model accounts for the error sensitivity of the control systems and (b) whether the two systems have similar error sensitivity. Method Participants (N = 50) completed an adaptation paradigm, in which their first and second formants were perturbed such that a participant's /ε/ would sound like her /ӕ/. We measured adaptive responses to the perturbations at early (0-80 ms) and late (220-300 ms) time points relative to the onset of the perturbations. As data-driven correlates of the error sensitivity of the feedforward and feedback systems, we used the average early responses and difference responses (i.e., late minus early responses), respectively. We fitted the state-space model to participants' adaptive responses and used the model's parameters as model-based estimates of error sensitivity. Results We found that the late responses were larger than the early responses. Additionally, the model-based estimates of error sensitivity strongly correlated with the data-driven estimates. However, the data-driven and model-based estimates of error sensitivity of the feedforward system did not correlate with those of the feedback system. Conclusions Overall, our results suggested that the dynamics of adaptive responses as well as error sensitivity of the control systems can be accurately predicted by the model. Furthermore, our results suggested that the feedforward and feedback control systems function independently. Supplemental Material https://doi.org/10.23641/asha.14669808.}, } @article {pmid34019777, year = {2021}, author = {Souza, PE and Ellis, G and Marks, K and Wright, R and Gallun, F}, title = {Does the Speech Cue Profile Affect Response to Amplitude Envelope Distortion?.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6}, pages = {2053-2069}, pmid = {34019777}, issn = {1558-9102}, support = {R01 DC006014/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Cues ; *Hearing Aids ; *Hearing Loss, Sensorineural ; Humans ; Speech ; *Speech Perception ; }, abstract = {Purpose A broad area of interest to our group is to understand the consequences of the "cue profile" (a measure of how well a listener can utilize audible temporal and/or spectral cues) for listening scenarios in which a subset of cues is distorted.
The study goal was to determine if listeners whose cue profile indicated that they primarily used temporal cues for recognition would respond differently to speech-envelope distortion than listeners who utilized both spectral and temporal cues. Method Twenty-five adults with sensorineural hearing loss participated in the study. The listener's cue profile was measured by analyzing identification patterns for a set of synthetic syllables in which envelope rise time and formant transitions were varied. A linear discriminant analysis quantified the relative contributions of spectral and temporal cues to identification patterns. Low-context sentences in noise were processed with time compression, wide-dynamic range compression, or a combination of time compression and wide-dynamic range compression to create a range of speech-envelope distortions. An acoustic metric, a modified version of the Spectral Correlation Index, was calculated to quantify envelope distortion. Results A binomial generalized linear mixed-effects model indicated that envelope distortion, the cue profile, the interaction between envelope distortion and the cue profile, and the pure-tone average were significant predictors of sentence recognition. Conclusions The listeners with good perception of spectro-temporal contrasts were more resilient to the detrimental effects of envelope compression than listeners who used temporal cues to a greater extent. The cue profile may provide information about individual listening that can direct choice of hearing aid parameters, especially those parameters that affect the speech envelope.}, } @article {pmid33987821, year = {2021}, author = {Stilp, CE and Assgari, AA}, title = {Contributions of natural signal statistics to spectral context effects in consonant categorization.}, journal = {Attention, perception & psychophysics}, volume = {83}, number = {6}, pages = {2694-2708}, pmid = {33987821}, issn = {1943-393X}, mesh = {Acoustic Stimulation ; Humans ; Language ; *Phonetics ; Sound ; Sound Spectrography ; Speech Acoustics ; *Speech Perception ; }, abstract = {Speech perception, like all perception, takes place in context. Recognition of a given speech sound is influenced by the acoustic properties of surrounding sounds. When the spectral composition of earlier (context) sounds (e.g., a sentence with more energy at lower third formant [F3] frequencies) differs from that of a later (target) sound (e.g., consonant with intermediate F3 onset frequency), the auditory system magnifies this difference, biasing target categorization (e.g., towards higher-F3-onset /d/). Historically, these studies used filters to force context stimuli to possess certain spectral compositions. Recently, these effects were produced using unfiltered context sounds that already possessed the desired spectral compositions (Stilp & Assgari, 2019, Attention, Perception, & Psychophysics, 81, 2037-2052). Here, this natural signal statistics approach is extended to consonant categorization (/g/-/d/). Context sentences were either unfiltered (already possessing the desired spectral composition) or filtered (to imbue specific spectral characteristics). Long-term spectral characteristics of unfiltered contexts were poor predictors of shifts in consonant categorization, but short-term characteristics (last 475 ms) were excellent predictors. This diverges from vowel data, where long-term and shorter-term intervals (last 1,000 ms) were equally strong predictors. 
Thus, time scale plays a critical role in how listeners attune to signal statistics in the acoustic environment.}, } @article {pmid33979206, year = {2021}, author = {Dromey, C and Richins, M and Low, T}, title = {Kinematic and Acoustic Changes to Vowels and Diphthongs in Bite Block Speech.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6}, pages = {1794-1801}, doi = {10.1044/2021_JSLHR-20-00630}, pmid = {33979206}, issn = {1558-9102}, mesh = {Acoustics ; Biomechanical Phenomena ; Humans ; Phonetics ; *Speech ; *Speech Acoustics ; Young Adult ; }, abstract = {Purpose We examined the effect of bite block insertion (BBI) on lingual movements and formant frequencies in corner vowel and diphthong production in a sentence context. Method Twenty young adults produced the corner vowels (/u/, /ɑ/, /æ/, /i/) and the diphthong /ɑɪ/ in sentence contexts before and after BBI. An electromagnetic articulograph measured the movements of the tongue back, middle, and front. Results There were significant decreases in the acoustic vowel articulation index and vowel space area following BBI. The kinematic vowel articulation index decreased significantly for the back and middle of the tongue but not for the front. There were no significant acoustic changes post-BBI for the diphthong, other than a longer transition duration. Diphthong kinematic changes after BBI included smaller movements for the back and middle of the tongue, but not the front. Conclusions BBI led to a smaller acoustic working space for the corner vowels. The adjustments made by the front of the tongue were sufficient to compensate for the BBI perturbation in the diphthong, resulting in unchanged formant trajectories. The back and middle of the tongue were likely biomechanically restricted in their displacement by the fixation of the jaw, whereas the tongue front showed greater movement flexibility.}, } @article {pmid33977813, year = {2024}, author = {Onosson, S and Stewart, J}, title = {The Effects of Language Contact on Non-Native Vowel Sequences in Lexical Borrowings: The Case of Media Lengua.}, journal = {Language and speech}, volume = {67}, number = {2}, pages = {498-527}, pmid = {33977813}, issn = {1756-6053}, mesh = {Humans ; *Phonetics ; *Speech Acoustics ; Female ; Male ; *Multilingualism ; Adult ; Speech Production Measurement ; Young Adult ; Language ; }, abstract = {Media Lengua (ML), a mixed language derived from Quichua and Spanish, exhibits a phonological system that largely conforms to that of Quichua acoustically. Yet, it incorporates a large number of vowel sequences from Spanish which do not occur in the Quichua system. This includes the use of mid-vowels, which are phonetically realized in ML as largely overlapping with the high-vowels in acoustic space. We analyze and compare production of vowel sequences by speakers of ML, Quichua, and Spanish through the use of generalized additive mixed models to determine statistically significant differences between vowel formant trajectories. Our results indicate that Spanish-derived ML vowel sequences frequently differ significantly from their Spanish counterparts, largely occupying a more central region of the vowel space and frequently exhibiting markedly reduced trajectories over time. In contrast, we find only one case where an ML vowel sequence differs significantly from its Quichua counterpart-and even in this case the difference from Spanish is substantially greater. 
Our findings show how the vowel system of ML successfully integrates novel vowel sequence patterns from Spanish into what is essentially Quichua phonology by markedly adapting their production, while still maintaining contrasts which are not expressed in Quichua.}, } @article {pmid33951578, year = {2021}, author = {Isler, B and Giroud, N and Hirsiger, S and Kleinjung, T and Meyer, M}, title = {Bilateral age-related atrophy in the planum temporale is associated with vowel discrimination difficulty in healthy older adults.}, journal = {Hearing research}, volume = {406}, number = {}, pages = {108252}, doi = {10.1016/j.heares.2021.108252}, pmid = {33951578}, issn = {1878-5891}, mesh = {Acoustic Stimulation ; Aged ; Atrophy/pathology ; *Auditory Cortex/diagnostic imaging ; Humans ; Speech ; *Speech Perception ; *Temporal Lobe/pathology ; }, abstract = {In this study we investigated the association between age-related brain atrophy and behavioural as well as electrophysiological markers of vowel perception in a sample of healthy younger and older adults with normal pure-tone hearing. Twenty-three older adults and 27 younger controls discriminated a set of vowels with altered second formants embedded in consonant-vowel syllables. Additionally, mismatch negativity (MMN) responses were recorded in a separate oddball paradigm with the same set of stimuli. A structural magnetic resonance scan was obtained for each participant to determine cortical architecture of the left and right planum temporale (PT). The PT was chosen for its function as a major processor of auditory cues and speech. Results suggested that older adults performed worse in vowel discrimination despite normal-for-age pure-tone hearing. In the older group, we found evidence that those with greater age-related cortical atrophy (i.e., lower cortical surface area and cortical volume) in the left and right PT also showed weaker vowel discrimination. In comparison, we found a lateralized correlation in the younger group suggesting that those with greater cortical thickness in only the left PT performed worse in the vowel discrimination task. We did not find any associations between macroanatomical traits of the PT and MMN responses. We conclude that deficient vowel processing is not only caused by pure-tone hearing loss but is also influenced by atrophy-related changes in the ageing auditory-related cortices. Furthermore, our results suggest that auditory processing might become more bilateral across the lifespan.}, } @article {pmid33938165, year = {2021}, author = {Xiao, Y and Wang, T and Deng, W and Yang, L and Zeng, B and Lao, X and Zhang, S and Liu, X and Ouyang, D and Liao, G and Liang, Y}, title = {Data mining of an acoustic biomarker in tongue cancers and its clinical validation.}, journal = {Cancer medicine}, volume = {10}, number = {11}, pages = {3822-3835}, pmid = {33938165}, issn = {2045-7634}, mesh = {Adult ; Aged ; Analysis of Variance ; Area Under Curve ; Articulation Disorders/diagnosis/*physiopathology ; China ; Cross-Sectional Studies ; *Data Mining ; Female ; Humans ; Male ; Middle Aged ; Quality of Life ; Sex Factors ; Speech Production Measurement/methods ; Support Vector Machine ; Tongue/surgery ; Tongue Neoplasms/diagnosis/pathology/*physiopathology/surgery ; }, abstract = {The promise of speech disorders as biomarkers in clinical examination has been identified in a broad spectrum of neurodegenerative diseases.
However, to the best of our knowledge, a validated acoustic marker with established discriminative and evaluative properties has not yet been developed for oral tongue cancers. Here we cross-sectionally collected a screening dataset that included acoustic parameters extracted from 3 sustained vowels /ɑ/, /i/, /u/ and binary perceptual outcomes from 12 consonant-vowel syllables. We used a support vector machine with a linear kernel function within this dataset to identify the formant centralization ratio (FCR) as a dominant predictor of different perceptual outcomes across gender and syllable. The Acoustic analysis, Perceptual evaluation and Quality of Life assessment (APeQoL) was used to validate the FCR in 33 patients with primary resectable oral tongue cancers. Measurements were taken before (pre-op) and four to six weeks after (post-op) surgery. The speech handicap index (SHI), a speech-specific questionnaire, was also administered at these time points. Pre-op correlation analysis within the APeQoL revealed overall consistency and a strong correlation between FCR and SHI scores. FCRs also increased significantly with increasing T classification pre-operatively, especially for women. Longitudinally, the main effects of T classification, the extent of resection, and their interaction effects with time (pre-op vs. post-op) on FCRs were all significant. For pre-operative FCR, after merging the two datasets, a cut-off value of 0.970 produced an AUC of 0.861 (95% confidence interval: 0.785-0.938) for T3-4 patients. In sum, this study determined that FCR is an acoustic marker with the potential to detect disease and related speech function in oral tongue cancers. These are preliminary findings that need to be replicated in longitudinal studies and/or larger cohorts.}, } @article {pmid33909840, year = {2021}, author = {Rocha-Muniz, CN and Schochat, E}, title = {Investigation of the neural discrimination of acoustic characteristics of speech sounds in normal-hearing individuals through Frequency-following Response (FFR).}, journal = {CoDAS}, volume = {33}, number = {1}, pages = {e20180324}, doi = {10.1590/2317-1782/20202018324}, pmid = {33909840}, issn = {2317-1782}, mesh = {Acoustic Stimulation ; Acoustics ; Child ; Evoked Potentials, Auditory, Brain Stem ; Hearing ; Humans ; *Phonetics ; *Speech Perception ; }, abstract = {PURPOSE: To evaluate how the auditory pathways encode and discriminate the plosive syllables [ga], [da] and [ba] using the auditory evoked Frequency-following Response (FFR) in children with typical development.

METHODS: Twenty children aged 6-12 years were evaluated using the FFR for the [ga], [da] and [ba] stimuli. The stimuli were composed of six formants and were differentiated in the F2 to F3 transition (transient portion). The other formants were identical in the three syllables (sustained portion). The latencies of the 16 waves of the transient portion (<70 ms) and of the 21 waves of the sustained portion (90-160 ms) of the stimuli were analyzed in the neural responses obtained for each of the syllables.

RESULTS: The transient portion latencies were different in the three syllables, indicating a distinction in the acoustic characteristics of these syllables through their neural representations. In addition, the transient portion latencies progressively increased in the following order: [ga] < [da] < [ba], whereas no significant differences were observed in the sustained portion.

CONCLUSION: The FFR proved to be an efficient tool to investigate the subcortical acoustic differences in speech sounds, since it demonstrated different electrophysiological responses for the three evoked syllables. Changes in latency were observed in the transient portion (consonants) but not in the sustained portion (vowels) for the three stimuli. These results indicate the neural ability to distinguish between acoustic characteristics of the [ga], [da] and [ba] stimuli.}, } @article {pmid33900806, year = {2021}, author = {Chiu, YF and Neel, A and Loux, T}, title = {Exploring the Acoustic Perceptual Relationship of Speech in Parkinson's Disease.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {5}, pages = {1560-1570}, doi = {10.1044/2021_JSLHR-20-00610}, pmid = {33900806}, issn = {1558-9102}, mesh = {Acoustics ; Aged ; Dysarthria/diagnosis/etiology ; Humans ; *Parkinson Disease/complications ; *Speech ; Speech Acoustics ; Speech Intelligibility ; Speech Production Measurement ; }, abstract = {Purpose Auditory perceptual judgments are commonly used to diagnose dysarthria and assess treatment progress. The purpose of the study was to examine the acoustic underpinnings of perceptual speech abnormalities in individuals with Parkinson's disease (PD). Method Auditory perceptual judgments were obtained from sentences produced by 13 speakers with PD and five healthy older adults. Twenty young listeners rated overall ease of understanding, articulatory precision, voice quality, and prosodic adequacy on a visual analog scale. Acoustic measures associated with the speech subsystems of articulation, phonation, and prosody were obtained, including second formant transitions, articulation rate, cepstral and spectral measures of voice, and pitch variations. Regression analyses were performed to assess the relationships between perceptual judgments and acoustic variables. Results Perceptual impressions of Parkinsonian speech were related to combinations of several acoustic variables. Approximately 36%-49% of the variance in the perceptual ratings was explained by the acoustic measures, indicating a modest acoustic perceptual relationship. Conclusions The relationships between perceptual ratings and acoustic signals in Parkinsonian speech are multifactorial and involve a variety of acoustic features simultaneously. The modest acoustic perceptual relationships, however, suggest that future work is needed to further examine the acoustic bases of perceptual judgments in dysarthria.}, } @article {pmid33900786, year = {2021}, author = {Parrell, B and Ivry, RB and Nagarajan, SS and Houde, JF}, title = {Intact Correction for Self-Produced Vowel Formant Variability in Individuals With Cerebellar Ataxia Regardless of Auditory Feedback Availability.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6S}, pages = {2234-2247}, pmid = {33900786}, issn = {1558-9102}, support = {R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; }, mesh = {*Cerebellar Ataxia ; Feedback ; Feedback, Sensory ; Humans ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception ; }, abstract = {Purpose Individuals with cerebellar ataxia (CA) caused by cerebellar degeneration exhibit larger reactive compensatory responses to unexpected auditory feedback perturbations than neurobiologically typical speakers, suggesting they may rely more on feedback control during speech.
We test this hypothesis by examining variability in unaltered speech. Previous studies of typical speakers have demonstrated a reduction in formant variability (centering) observed during the initial phase of vowel production from vowel onset to vowel midpoint. Centering is hypothesized to reflect feedback-based corrections for self-produced variability and thus may provide a behavioral assay of feedback control in unperturbed speech in the same manner as the compensatory response does for feedback perturbations. Method To comprehensively compare centering in individuals with CA and controls, we examine centering in two vowels (/i/ and /ɛ/) under two contexts (isolated words and connected speech). As a control, we examine speech produced both with and without noise to mask auditory feedback. Results Individuals with CA do not show increased centering compared to age-matched controls, regardless of vowel, context, or masking. Contrary to previous results in neurobiologically typical speakers, centering was not affected by the presence of masking noise in either group. Conclusions The similar magnitude of centering seen with and without masking noise questions whether centering is driven by auditory feedback. However, if centering is at least partially driven by auditory/somatosensory feedback, these results indicate that the larger compensatory response to altered auditory feedback observed in individuals with CA may not reflect typical motor control processes during normal, unaltered speech production.}, } @article {pmid33895925, year = {2021}, author = {Kovalenko, AN and Kastyro, IV and Reshetov, IV and Popadyuk, VI}, title = {Study of the Role of Hearing Aid on the Area of the Acoustic Field of Vowels.}, journal = {Doklady. Biochemistry and biophysics}, volume = {497}, number = {1}, pages = {108-111}, pmid = {33895925}, issn = {1608-3091}, mesh = {*Acoustics ; Adult ; Female ; *Hearing Aids ; Humans ; Male ; Sound ; }, abstract = {The method of transformation of acoustic vowel triangles (AVT) /a/, /i/, /u/ was used for an objective assessment of the acoustic features of vowels in the speech production of 20 persons with long-term hearing impairment (LHI). The logarithm of the values of the first two formants of each vowel (logF1, logF2) was determined for each subject. AVTs were transformed into right-angled triangles whose /u/ vertices were moved to the origin of coordinates and whose legs were aligned with the coordinate axes. In patients with LHI, the size of the triangles usually decreased, and they were stretched along one of the axes, which probably depends not only on the hearing loss severity but also on the duration of hearing aid use.
The presented approach to the normalization of AVTs makes it possible to distinguish at least three groups of persons with LHI: in the first group, vowel triangles are stretched along the logF1 axis; in the second group, vowel triangles are stretched along the logF2 axis; and in the third group, AVTs are symmetric.}, } @article {pmid33863624, year = {2023}, author = {Lã, FMB and Silva, LS and Granqvist, S}, title = {Long-Term Average Spectrum Characteristics of Portuguese Fado-Canção from Coimbra.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {4}, pages = {631.e7-631.e15}, doi = {10.1016/j.jvoice.2021.03.005}, pmid = {33863624}, issn = {1873-4588}, mesh = {Humans ; Speech Acoustics ; Portugal ; *Voice ; *Singing ; Acoustics ; }, abstract = {Descriptions of acoustical characteristics of Fado, a Portuguese urban style sung in Lisbon and Oporto, are scarce, particularly concerning Fado-Canção, a related style sung in Coimbra. The present study aims at describing long-term average spectrum (LTAS) parameters of 16 professional singers while singing and reading the lyrics of a typical Fado-Canção. LTAS parameters were investigated in terms of: (1) equivalent sound level (Leq); (2) spectral differences between 3 frequency bands 0-2, 2-5, and 5-8 kHz; and (3) quantification of spectral prominence between 2 and 4 kHz, calculated as the level difference between the peak in this frequency region and a reference trendline between 1 and 5 kHz, henceforth Formant Cluster Prominence (FCP). Given that Fado-Canção, besides Fado and traditional styles, originated also from classical singing, and that previous studies on Fado suggest the absence of a singer's formant cluster, the averaged LTAS for all Fado-Canção singers was further compared to the LTAS of two world-touring opera baritones singing an operatic aria and a lied. Results show that Fado-Canção is commonly sung with a Leq of 86.4 dB and an FCP of about 10 dB, values significantly higher when compared to reading. The FCP in Fado-Canção, although smaller than for the two classical opera singers' examples (14.8 and 20 dB, respectively), suggests that the style preserved some of its original lyrical influence. However, because younger singers present higher energy in the 5-8 kHz region relative to the remaining frequency bands as compared to older singers, it seems that Fado-Canção may be drifting towards non-classical vocal practices. FCP seems to be a promising straightforward method to quantify the degree of formant clustering around the region of the singer's formant in LTAS, allowing comparisons between different singers and singing styles.}, } @article {pmid33856659, year = {2021}, author = {Loni, DY and Subbaraman, S}, title = {Genetically related singers-acoustic feature analysis and impact on singer identification.}, journal = {Journal of applied genetics}, volume = {62}, number = {3}, pages = {459-467}, pmid = {33856659}, issn = {2190-3883}, mesh = {Acoustics ; Female ; Humans ; Male ; Music ; Parents ; Siblings ; Singing/*genetics ; Voice Quality/*genetics ; }, abstract = {Studies relating music to genetics form a fascinating field of research. In this study, we attempted to answer a curious question: how acoustically close are genetically related singers? We investigated this question using two types of genetic relationship: three female sibling singers, and a father-son singer pair.
These are famous Indian playback singers, and the acoustic features were extracted from songs of Bollywood films. Three different self-developed a cappella databases were used for the experiments. Positive correlations among the major musical aptitudes (pitch, vibrato, formant, and harmonic spectral envelope) for both singer relationships revealed the genetic impact on the acoustic features. Also, investigation of the timbre spectral feature proved it to be a significant acoustic feature that differentiates similar voices. With Spearman's correlation coefficient, we conclude that a strong acoustical association was observed between the acoustic features of genetically related singers, especially the female sibling singers. This was further validated by correlating these singers with genetically unrelated singers. A human perception test performed using cover songs indicated the genetic impact on voice similarity, while the automatic singer identification system discriminated singers more accurately than the human listeners.}, } @article {pmid33833720, year = {2021}, author = {Hsieh, IH and Yeh, WT}, title = {The Interaction Between Timescale and Pitch Contour at Pre-attentive Processing of Frequency-Modulated Sweeps.}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {637289}, pmid = {33833720}, issn = {1664-1078}, abstract = {Speech comprehension across languages depends on encoding the pitch variations in frequency-modulated (FM) sweeps at different timescales and frequency ranges. While timescale and spectral contour of FM sweeps play important roles in differentiating acoustic speech units, relatively little work has been done to understand the interaction between the two acoustic dimensions at early cortical processing. An auditory oddball paradigm was employed to examine the interaction of timescale and pitch contour at pre-attentive processing of FM sweeps. Event-related potentials to frequency sweeps that vary in linguistically relevant pitch contour (fundamental frequency F0 vs. first formant frequency F1) and timescale (local vs. global) in Mandarin Chinese were recorded. Mismatch negativities (MMNs) were elicited by all types of sweep deviants. For the local timescale, FM sweeps with F0 contours yielded larger MMN amplitudes than F1 contours. A reversed MMN amplitude pattern was obtained with respect to F0/F1 contours for global-timescale stimuli. An interhemispheric asymmetry of MMN topography was observed corresponding to local- and global-timescale contours. Falling, but not rising, sweep contours elicited right hemispheric dominance in the difference waveforms. Results showed that timescale and pitch contour interact with each other in pre-attentive auditory processing of FM sweeps. Findings suggest that FM sweeps, a type of non-speech signal, are processed at an early stage with reference to their linguistic function.
That the dynamic interaction between timescale and spectral pattern is processed during early cortical processing of non-speech frequency sweeps may be critical for facilitating speech encoding at a later stage.}, } @article {pmid33833252, year = {2021}, author = {Wright, E and Grawunder, S and Ndayishimiye, E and Galbany, J and McFarlin, SC and Stoinski, TS and Robbins, MM}, title = {Chest beats as an honest signal of body size in male mountain gorillas (Gorilla beringei beringei).}, journal = {Scientific reports}, volume = {11}, number = {1}, pages = {6879}, pmid = {33833252}, issn = {2045-2322}, mesh = {Acoustics ; Animals ; *Body Size ; *Competitive Behavior ; Gorilla gorilla/*physiology ; Male ; *Reproduction ; Thorax/*physiology ; Vocalization, Animal/*physiology ; }, abstract = {Acoustic signals that reliably indicate body size, which usually determines competitive ability, are of particular interest for understanding how animals assess rivals and choose mates. Whereas body size tends to be negatively associated with formant dispersion in animal vocalizations, non-vocal signals have received little attention. Among the most emblematic sounds in the animal kingdom is the chest beat of gorillas, a non-vocal signal that is thought to be important in intra- and inter-sexual competition, yet it is unclear whether it reliably indicates body size. We examined the relationship among body size (back breadth), peak frequency, and three temporal characteristics of the chest beat: duration, number of beats and beat rate from sound recordings of wild adult male mountain gorillas. Using linear mixed models, we found that larger males had significantly lower peak frequencies than smaller ones, but we found no consistent relationship between body size and the temporal characteristics measured. Taken together with earlier findings of positive correlations among male body size, dominance rank and reproductive success, we conclude that the gorilla chest beat is an honest signal of competitive ability. These results emphasize the potential of non-vocal signals to convey important information in mammal communication.}, } @article {pmid33831309, year = {2021}, author = {Jekiel, M and Malarski, K}, title = {Musical Hearing and Musical Experience in Second Language English Vowel Acquisition.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {5}, pages = {1666-1682}, doi = {10.1044/2021_JSLHR-19-00253}, pmid = {33831309}, issn = {1558-9102}, mesh = {Adult ; Hearing ; Hearing Tests ; Humans ; Language ; *Multilingualism ; *Music ; Phonetics ; *Speech Perception ; }, abstract = {Purpose Former studies suggested that music perception can help produce certain accentual features in the first and second language (L2), such as intonational contours. What was missing in many of these studies was the identification of the exact relationship between specific music perception skills and the production of different accentual features in a foreign language. Our aim was to verify whether empirically tested musical hearing skills can be related to the acquisition of English vowels by learners of English as an L2 before and after a formal accent training course. Method Fifty adult Polish speakers of L2 English were tested before and after a two-semester accent training course in order to observe the effect of musical hearing on the acquisition of English vowels.
Their L2 English vowel formant contours produced in consonant-vowel-consonant context were compared with the target General British vowels produced by their pronunciation teachers. We juxtaposed these results with their musical hearing test scores and self-reported musical experience to observe a possible relationship between successful L2 vowel acquisition and musical aptitude. Results Preexisting rhythmic memory was reported as a significant predictor before training, while musical experience was reported as a significant factor in the production of more native-like L2 vowels after training. We also observed that not all vowels were equally acquired or affected by musical hearing or musical experience. The strongest estimate we observed was the closeness to the model before training, suggesting that learners who already managed to acquire some features of a native-like accent were also more successful after training. Conclusions Our results are revealing in two respects. First, the learners' former proficiency in L2 pronunciation is the most robust predictor in acquiring a native-like accent. Second, there is a potential relationship between rhythmic memory and L2 vowel acquisition before training, as well as years of musical experience after training, suggesting that specific musical skills and music practice can be an asset in learning a foreign language accent.}, } @article {pmid33825503, year = {2021}, author = {Michell, CT and Nyman, T}, title = {Microbiomes of willow-galling sawflies: effects of host plant, gall type, and phylogeny on community structure and function.}, journal = {Genome}, volume = {64}, number = {6}, pages = {615-626}, doi = {10.1139/gen-2020-0018}, pmid = {33825503}, issn = {1480-3321}, mesh = {Animals ; Bacteria/*classification/*genetics ; Biodiversity ; Host Microbial Interactions ; Host Specificity ; Insecta ; Larva ; Microbiota/*genetics/*physiology ; *Phylogeny ; Plant Growth Regulators ; Plant Leaves ; RNA, Ribosomal, 16S/genetics ; Salix/*microbiology ; }, abstract = {While free-living herbivorous insects are thought to harbor microbial communities composed of transient bacteria derived from their diet, recent studies indicate that insects that induce galls on plants may be involved in more intimate host-microbe relationships. We used 16S rDNA metabarcoding to survey larval microbiomes of 20 nematine sawfly species that induce bud or leaf galls on 13 Salix species. The 391 amplicon sequence variants (ASVs) detected represented 69 bacterial genera in six phyla. Multivariate statistical analyses showed that the structure of larval microbiomes is influenced by willow host species as well as by gall type. Nevertheless, a "core" microbiome composed of 58 ASVs is shared widely across the focal galler species. Within the core community, the presence of many abundant, related ASVs representing multiple distantly related bacterial taxa is reflected as a statistically significant effect of bacterial phylogeny on galler-microbe associations. Members of the core community have a variety of inferred functions, including degradation of phenolic compounds, nutrient supplementation, and production of plant hormones.
Hence, our results support suggestions of intimate and diverse interactions between galling insects and microbes and add to a growing body of evidence that microbes may play a role in the induction of insect galls on plants.}, } @article {pmid33798490, year = {2021}, author = {Zhang, K and Sjerps, MJ and Peng, G}, title = {Integral perception, but separate processing: The perceptual normalization of lexical tones and vowels.}, journal = {Neuropsychologia}, volume = {156}, number = {}, pages = {107839}, doi = {10.1016/j.neuropsychologia.2021.107839}, pmid = {33798490}, issn = {1873-3514}, mesh = {Adult ; Cues ; Humans ; Language ; Phonetics ; Pitch Perception ; Speech ; *Speech Acoustics ; *Speech Perception ; }, abstract = {In tonal languages, speech variability arises in both lexical tone (i.e., suprasegmentally) and vowel quality (segmentally). Listeners can use surrounding speech context to overcome variability in both speech cues, a process known as extrinsic normalization. Although vowels are the main carriers of tones, it is still unknown whether the combined percept (lexical tone and vowel quality) is normalized integrally or in partly separate processes. Here we used electroencephalography (EEG) to investigate the time course of lexical tone normalization and vowel normalization to answer this question. Cantonese adults listened to synthesized three-syllable stimuli in which the identity of a target syllable - ambiguous between high vs. mid-tone (Tone condition) or between /o/ vs. /u/ (Vowel condition) - was dependent on either the tone range (Tone condition) or the formant range (Vowel condition) of the first two syllables. It was observed that the ambiguous tone was more often interpreted as a high-level tone when the context had a relatively low pitch than when it had a high pitch (Tone condition). Similarly, the ambiguous vowel was more often interpreted as /o/ when the context had a relatively low formant range than when it had a relatively high formant range (Vowel condition). These findings show the typical pattern of extrinsic tone and vowel normalization. Importantly, the EEG results of participants showing the contrastive normalization effect demonstrated that the effects of vowel normalization could already be observed within the N2 time window (190-350 ms), while the first reliable effect of lexical tone normalization on cortical processing was observable only from the P3 time window (220-500 ms) onwards. The ERP patterns demonstrate that the contrastive perceptual normalization of lexical tones and that of vowels occur in at least partially separate time windows.
This suggests that the extrinsic normalization can operate at the level of phonemes and tonemes separately instead of operating on the whole syllable at once.}, } @article {pmid33795617, year = {2021}, author = {Smith, ML and Winn, MB}, title = {Individual Variability in Recalibrating to Spectrally Shifted Speech: Implications for Cochlear Implants.}, journal = {Ear and hearing}, volume = {42}, number = {5}, pages = {1412-1427}, pmid = {33795617}, issn = {1538-4667}, support = {R01 DC017114/DC/NIDCD NIH HHS/United States ; R03 DC014309/DC/NIDCD NIH HHS/United States ; }, mesh = {*Cochlear Implantation ; *Cochlear Implants ; Humans ; Reproducibility of Results ; Speech ; *Speech Perception ; }, abstract = {OBJECTIVES: Cochlear implant (CI) recipients are at a severe disadvantage compared with normal-hearing listeners in distinguishing consonants that differ by place of articulation because the key relevant spectral differences are degraded by the implant. One component of that degradation is the upward shifting of spectral energy that occurs with a shallow insertion depth of a CI. The present study aimed to systematically measure the effects of spectral shifting on word recognition and phoneme categorization by specifically controlling the amount of shifting and using stimuli whose identification specifically depends on perceiving frequency cues. We hypothesized that listeners would be biased toward perceiving phonemes that contain higher-frequency components because of the upward frequency shift and that intelligibility would decrease as spectral shifting increased.

DESIGN: Normal-hearing listeners (n = 15) heard sine wave-vocoded speech with simulated upward frequency shifts of 0, 2, 4, and 6 mm of cochlear space to simulate shallow CI insertion depth. Stimuli included monosyllabic words and /b/-/d/ and /ʃ/-/s/ continua that varied systematically by formant frequency transitions or frication noise spectral peaks, respectively. Recalibration to spectral shifting was operationally defined as shifting perceptual acoustic-phonetic mapping commensurate with the spectral shift; in other words, adjusting frequency expectations for both phonemes upward so that a perceptual distinction is preserved, rather than hearing all upward-shifted phonemes as the higher-frequency member of the pair.
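For orientation, shifts expressed in millimeters of cochlear space are conventionally related to frequency through Greenwood's place-frequency function. The sketch below (our illustration using the standard human constants, not code from the study) converts a component's frequency to cochlear place, displaces it basally, and converts back:

    import numpy as np

    # Greenwood (1990) human map: f = A * (10**(a * x) - 1), x in mm from the apex
    A, a = 165.4, 0.06

    def place_from_freq(f_hz):
        """Cochlear place (mm from apex) corresponding to a frequency in Hz."""
        return np.log10(f_hz / A + 1.0) / a

    def freq_from_place(x_mm):
        """Frequency (Hz) at a given cochlear place (mm from apex)."""
        return A * (10.0 ** (a * x_mm) - 1.0)

    def shift_freq(f_hz, shift_mm):
        """Frequency reached when a component is displaced basally by shift_mm."""
        return freq_from_place(place_from_freq(f_hz) + shift_mm)

    # A 1-kHz component under the four simulated insertion-depth conditions
    for mm in (0, 2, 4, 6):
        print(f"{mm} mm -> {shift_freq(1000.0, mm):.0f} Hz")

Under this map, a 6-mm basal shift moves a 1-kHz component to roughly 2.5 kHz, which conveys how disruptive the largest condition is.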

RESULTS: For moderate amounts of spectral shifting, group data suggested a general "halfway" recalibration to spectral shifting, but individual data suggested a notably different conclusion: half of the listeners were able to recalibrate fully, while the other half were unable to categorize shifted speech with any reliability. There were no participants who demonstrated a pattern intermediate to these two extremes. Intelligibility of words decreased with greater amounts of spectral shifting, also showing loose clusters of better- and poorer-performing listeners. Phonetic analysis of word errors revealed that certain cues were more susceptible to being compromised due to a frequency shift (place and manner of articulation), while voicing was robust to spectral shifting.

CONCLUSIONS: Shifting the frequency spectrum of speech has systematic effects that are in line with known properties of speech acoustics, but the ensuing difficulties cannot be predicted based on tonotopic mismatch alone. Difficulties are subject to substantial individual differences in the capacity to adjust acoustic-phonetic mapping. These results help to explain why speech recognition in CI listeners cannot be fully predicted by peripheral factors like electrode placement and spectral resolution; even among listeners with functionally equivalent auditory input, there is an additional factor of simply being able or unable to flexibly adjust acoustic-phonetic mapping. This individual variability could motivate precise treatment approaches guided by an individual's relative reliance on wideband frequency representation (even if it is mismatched) or limited frequency coverage whose tonotopy is preserved.}, } @article {pmid33792205, year = {2021}, author = {Chen, F and Zhang, H and Ding, H and Wang, S and Peng, G and Zhang, Y}, title = {Neural coding of formant-exaggerated speech and nonspeech in children with and without autism spectrum disorders.}, journal = {Autism research : official journal of the International Society for Autism Research}, volume = {14}, number = {7}, pages = {1357-1374}, doi = {10.1002/aur.2509}, pmid = {33792205}, issn = {1939-3806}, mesh = {*Autism Spectrum Disorder/complications ; Child ; Child, Preschool ; Evoked Potentials ; Humans ; Language Development ; Phonetics ; Speech ; *Speech Perception ; }, abstract = {The presence of vowel exaggeration in infant-directed speech (IDS) may adapt to the age-appropriate demands in speech and language acquisition. Previous studies have provided behavioral evidence of atypical auditory processing towards IDS in children with autism spectrum disorders (ASD), while the underlying neurophysiological mechanisms remain unknown. This event-related potential (ERP) study investigated the neural coding of formant-exaggerated speech and nonspeech in 24 4- to 11-year-old children with ASD and 24 typically-developing (TD) peers. The EEG data were recorded using an alternating block design, in which each stimulus type (exaggerated/non-exaggerated sound) was presented with equal probability. ERP waveform analysis revealed an enhanced P1 for vowel formant exaggeration in the TD group but not in the ASD group. This speech-specific atypical processing in ASD was not found for the nonspeech stimuli which showed similar P1 enhancement in both ASD and TD groups. Moreover, the time-frequency analysis indicated that children with ASD showed differences in neural synchronization in the delta-theta bands for processing acoustic formant changes embedded in nonspeech. Collectively, the results add substantiating neurophysiological evidence (i.e., a lack of neural enhancement effect of vowel exaggeration) for atypical auditory processing of IDS in children with ASD, which may exert a negative effect on phonetic encoding and language learning. LAY SUMMARY: Atypical responses to motherese might act as a potential early marker of risk for children with ASD. 
This study investigated the neural responses to such socially relevant stimuli in the ASD brain, and the results suggested a lack of neural enhancement in response to motherese, even in individuals without intellectual disability.}, } @article {pmid33786072, year = {2021}, author = {Carmona-Duarte, C and Ferrer, MA and Plamondon, R and Gómez-Rodellar, A and Gómez-Vilda, P}, title = {Sigma-Lognormal Modeling of Speech.}, journal = {Cognitive computation}, volume = {13}, number = {2}, pages = {488-503}, pmid = {33786072}, issn = {1866-9956}, abstract = {Human movement studies and analyses have been fundamental in many scientific domains, ranging from neuroscience to education, pattern recognition to robotics, health care to sports, and beyond. Previous speech motor models have been proposed to understand how speech movement is produced and how the resulting speech varies when some parameters are changed. However, the inverse approach, in which the muscular response parameters and the subject's age are derived from real continuous speech, is not possible with such models. Instead, in the handwriting field, the kinematic theory of rapid human movements and its associated Sigma-lognormal model have been applied successfully to obtain the muscular response parameters. This work presents a speech kinematics-based model that can be used to study, analyze, and reconstruct complex speech kinematics in a simplified manner. A method based on the kinematic theory of rapid human movements and its associated Sigma-lognormal model is applied to describe and to parameterize the asymptotic impulse response of the neuromuscular networks involved in speech as a response to a neuromotor command. The method used to carry out transformations from formants to a movement observation is also presented. Experiments carried out with the (English) VTR-TIMIT database and the (German) Saarbrucken Voice Database, including people of different ages, with and without laryngeal pathologies, corroborate the link between the extracted parameters and aging, on the one hand, and the proportion between the first and second formants required in applying the kinematic theory of rapid human movements, on the other. The results should drive innovative developments in the modeling and understanding of speech kinematics.}, } @article {pmid33775469, year = {2023}, author = {Oren, L and Rollins, M and Gutmark, E and Howell, R}, title = {How Face Masks Affect Acoustic and Auditory Perceptual Characteristics of the Singing Voice.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {4}, pages = {515-521}, doi = {10.1016/j.jvoice.2021.02.028}, pmid = {33775469}, issn = {1873-4588}, mesh = {Humans ; *Singing ; Voice Quality ; Masks ; *COVID-19 ; Acoustics ; }, abstract = {Wearing a face mask has been accepted as one of the most effective ways for slowing the spread of COVID-19. Yet information regarding the degree to which masks affect acoustics and perception associated with voice performers is scarce. This study examines these effects with common face masks, namely a neck gaiter, disposable surgical mask, and N95 mask, as well as a novel material that could be used as a mask (acoustic foam). A recorded excerpt from the "Star-Spangled Banner" was played through a miniature speaker placed inside the mouth of a masked manikin. Experienced listeners were asked to rate perceptual qualities of these singing stimuli by blindly comparing them with the same recording captured without a mask.
Acoustic analysis showed that face masks affected the sound by enhancing or suppressing different frequency bands compared to no mask. Acoustic energy around the singer's formant was reduced when using surgical and N95 masks, which matches observations that these masks are more detrimental to the perception of the singing voice compared with the neck gaiter or acoustic foam. This suggests that singers can benefit from masks designed for minimal impact on auditory perception of the singing voice while maintaining reasonable filtering efficiency.}, } @article {pmid33773895, year = {2023}, author = {Havel, M and Sundberg, J and Traser, L and Burdumy, M and Echternach, M}, title = {Effects of Nasalization on Vocal Tract Response Curve.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {3}, pages = {339-347}, doi = {10.1016/j.jvoice.2021.02.013}, pmid = {33773895}, issn = {1873-4588}, mesh = {Humans ; *Nose/physiology ; *Paranasal Sinuses/physiology ; Vibration ; Magnetic Resonance Imaging ; Models, Biological ; Speech Acoustics ; }, abstract = {BACKGROUND: Earlier studies have shown that nasalization affects the radiated spectrum by modifying the vocal tract transfer function in a complex manner.

METHODS: Here we study this phenomenon by measuring the sine-sweep responses of 3-D models of the vowels /u, a, æ, i/, derived from volumetric MR imaging and coupled by means of tubes of different lengths and diameters to a 3-D model of a nasal tract.
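A methodological aside: a response curve can be estimated from a sine-sweep measurement by deconvolution, i.e., dividing the spectrum of the recorded response by that of the excitation. A minimal sketch under that assumption (the arrays sweep and recording are placeholders, not the study's data):

    import numpy as np

    def transfer_function(x, y, fs):
        """Estimate H(f) = Y(f)/X(f) from excitation x and recorded response y."""
        n = int(2 ** np.ceil(np.log2(len(x) + len(y))))    # zero-pad for linear deconvolution
        X, Y = np.fft.rfft(x, n), np.fft.rfft(y, n)
        eps = 1e-12 * np.abs(X).max()                      # regularize the spectral division
        H = Y * np.conj(X) / (np.abs(X) ** 2 + eps)
        freqs = np.fft.rfftfreq(n, 1.0 / fs)
        return freqs, 20.0 * np.log10(np.abs(H) + 1e-12)   # magnitude in dB

    # freqs, H_db = transfer_function(sweep, recording, fs=44100)
    # band = (freqs > 200) & (freqs < 2000)                # search region for a nasal dip
    # dip_hz = freqs[band][np.argmin(H_db[band])]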

RESULTS: The coupling introduced a dip into the vocal tract transfer function. The dip frequency was close to the main resonance of the nasal tract, a result in agreement with the in vivo sweep-tone measurements of Fujimura & Lindqvist [Fujimura & Lindqvist, 1972]. With increasing size of the coupling tube, the depth of the dip increased, and the first formant peak either changed in frequency or was split by the dip. Only marginal effects of the paranasal sinuses were observed. For certain coupling tube sizes, the spectral balance was changed, boosting the formant peaks in the 2-4 kHz range.
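Qualitatively, the reported dip behaves like an anti-resonance (a transmission zero) added to an all-pole vocal tract response. The toy pole-zero model below, which is our simplification and not the authors' 3-D acoustic measurement, reproduces the pattern: moving the zero pair toward the unit circle (standing in for a larger coupling area) deepens the dip near the nasal resonance:

    import numpy as np
    from scipy import signal

    fs = 16000
    formants = [(700, 60), (1200, 90), (2600, 120)]      # illustrative (freq Hz, bandwidth Hz)

    def pole_pair(freq, bw):
        r = np.exp(-np.pi * bw / fs)                     # pole radius from bandwidth
        w = 2 * np.pi * freq / fs
        return [r * np.exp(1j * w), r * np.exp(-1j * w)]

    poles = [p for f, b in formants for p in pole_pair(f, b)]

    def response_db(coupling, f_nasal=500.0):
        """Magnitude response with a zero pair of radius `coupling` at f_nasal."""
        wn = 2 * np.pi * f_nasal / fs
        zeros = [] if coupling == 0 else [coupling * np.exp(1j * wn),
                                          coupling * np.exp(-1j * wn)]
        w, h = signal.freqz_zpk(zeros, poles, 1.0, worN=2048, fs=fs)
        return w, 20.0 * np.log10(np.abs(h) + 1e-12)

    for c in (0.0, 0.9, 0.99):                           # larger c ~ wider velopharyngeal port
        freqs, mag = response_db(c)
        i = np.argmin(np.abs(freqs - 500.0))
        print(f"coupling={c}: level near 500 Hz = {mag[i]:.1f} dB (uncalibrated)")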

CONCLUSION: A velopharyngeal opening introduces a dip in the transfer function at the main resonance of the nasal tract. Its depth increases with the area of the opening and its frequency rises in some vowels.}, } @article {pmid33769836, year = {2021}, author = {Coughler, C and Hamel, EM and Cardy, JO and Archibald, LMD and Purcell, DW}, title = {Compensation to Altered Auditory Feedback in Children With Developmental Language Disorder and Typical Development.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6S}, pages = {2363-2376}, doi = {10.1044/2020_JSLHR-20-00374}, pmid = {33769836}, issn = {1558-9102}, mesh = {Child ; Feedback ; Humans ; *Language Development Disorders ; Speech ; *Speech Perception ; Speech Production Measurement ; }, abstract = {Purpose Developmental language disorder (DLD), an unexplained problem with using and understanding spoken language, has been hypothesized to have an underlying auditory processing component. Auditory feedback plays a key role in speech motor control. The current study examined whether auditory feedback is used to regulate speech production in a similar way by children with DLD and their typically developing (TD) peers. Method Participants aged 6-11 years completed tasks measuring hearing, language, first formant (F1) discrimination thresholds, partial vowel space, and responses to altered auditory feedback with F1 perturbation. Results Children with DLD tended to compensate more than TD children for the positive F1 manipulation and compensated less than TD children in the negative shift condition. Conclusion Our findings suggest that children with DLD make atypical use of auditory feedback.}, } @article {pmid33758251, year = {2021}, author = {Arenillas-Alcón, S and Costa-Faidella, J and Ribas-Prats, T and Gómez-Roig, MD and Escera, C}, title = {Neural encoding of voice pitch and formant structure at birth as revealed by frequency-following responses.}, journal = {Scientific reports}, volume = {11}, number = {1}, pages = {6660}, pmid = {33758251}, issn = {2045-2322}, mesh = {*Acoustic Stimulation ; Adult ; Age Factors ; Biomarkers ; Brain/*physiology ; Cognition ; Humans ; Infant, Newborn ; Pediatrics ; *Pitch Perception ; Sound Spectrography ; Speech Perception ; *Voice ; }, abstract = {Detailed neural encoding of voice pitch and formant structure plays a crucial role in speech perception, and is of key importance for an appropriate acquisition of the phonetic repertoire in infants from birth. However, the extent to which newborns are capable of extracting pitch and formant structure information from the temporal envelope and the temporal fine structure of speech sounds, respectively, remains unclear. Here, we recorded the frequency-following response (FFR) elicited by a novel two-vowel, rising-pitch-ending stimulus to simultaneously characterize voice pitch and formant structure encoding accuracy in a sample of neonates and adults. Data revealed that newborns tracked changes in voice pitch reliably and no differently than adults, but exhibited weaker signatures of formant structure encoding, particularly at higher formant frequency ranges. Thus, our results indicate a well-developed encoding of voice pitch at birth, while formant structure representation is maturing in a frequency-dependent manner.
Furthermore, we demonstrate the feasibility of assessing voice pitch and formant structure encoding within clinical evaluation times in a hospital setting, and suggest the possibility of using this novel stimulus as a tool for longitudinal developmental studies of the auditory system.}, } @article {pmid33741872, year = {2021}, author = {Emrani, E and Ghaemi, H and Labafchi, A and Samieirad, S}, title = {The Effect of Bimaxillary Orthognathic Surgery on Voice Characteristics in Skeletal Class 3 Deformity Patients: An Evaluation Using Acoustic Analysis.}, journal = {The Journal of craniofacial surgery}, volume = {32}, number = {6}, pages = {2129-2133}, doi = {10.1097/SCS.0000000000007479}, pmid = {33741872}, issn = {1536-3732}, mesh = {Acoustics ; Adult ; Cephalometry ; Female ; Follow-Up Studies ; Humans ; Male ; *Malocclusion, Angle Class III/surgery ; Mandible ; Maxilla ; *Orthognathic Surgery ; *Orthognathic Surgical Procedures ; Osteotomy, Le Fort ; Osteotomy, Sagittal Split Ramus ; }, abstract = {The aim of this study was to analyze the effects of bimaxillary orthognathic surgery on the acoustic voice characteristics of skeletal class 3 patients. All healthy nonsyndromic patients with Class 3 deformity who were eligible for bimaxillary orthognathic surgery were included in this before and after quasi-experimental study. This experiment's main intervention was mandibular setback surgery by bilateral sagittal split osteotomy plus maxillary advancement using LeFort 1 osteotomy. Age, sex, and intraoperative jaw movements were recorded. Acoustic analysis of voice samples (vowels /a/ and /i/) was performed with Praat software to obtain the outcome variables. The fundamental frequency (F0) and formant frequencies (F1, F2, and F3) of these vowels were extracted 1 week preoperatively (T0) and 1 and 6 months postoperatively (T1, T2) by a speech therapist. The significance level was set at 0.05 using SPSS 19. The study sample comprised 20 patients including 11 women (55%) and 9 men (45%) with a mean age of 31.95 ± 4.72 years. The average mandibular setback and maxillary advancement were 3.30 ± 0.86 and 2.85 ± 0.74 mm, respectively. The fundamental frequency (F0) and the first, second, and third formants (F1, F2, F3) of vowels /i/ and /a/ were significantly decreased across the postoperative time intervals (P < 0.05). The findings revealed that bimaxillary orthognathic surgery (maxillary advancement and mandibular setback with bilateral sagittal split osteotomy) might reduce the acoustic formant parameters of voice to the normal frequency ranges in patients with class 3 skeletal deformities. More clinical trials with greater sample sizes and long-term follow-ups are suggested in the future.}, } @article {pmid33740875, year = {2022}, author = {Geng, P and Gu, W}, title = {Acoustic and Perceptual Characteristics of Mandarin Speech in Gay and Heterosexual Male Speakers.}, journal = {Language and speech}, volume = {65}, number = {4}, pages = {1096-1109}, doi = {10.1177/00238309211000783}, pmid = {33740875}, issn = {1756-6053}, mesh = {Male ; Humans ; Speech Acoustics ; Speech ; Heterosexuality ; Acoustics ; *Speech Perception ; *Sexual and Gender Minorities ; }, abstract = {This study investigated acoustic and perceptual characteristics of Mandarin speech produced by gay and heterosexual male speakers. Acoustic analysis of monosyllabic words showed significant differences between the two groups in voice fundamental frequency (F0), F1 of the low vowel, and duration of aspiration/frication in consonants.
The acoustic patterns of F0, formants, and the center of gravity as well as spectral skewness of /s/ differed from those reported for Western languages such as American English, which could be interpreted from a sociopsychological point of view based on the different acceptability of gay identity in the two societies. The results of a perceptual experiment revealed significant but weak correlations between the acoustic parameters and the score of perceived gayness, which was significantly higher for gay speech than for heterosexual male speech. Although the observed F0 and F1 patterns in Mandarin gay speech were opposite to the stereotype of gayness, gay identity can still be identified to some extent from speech due to the existence of other acoustic cues such as a longer fricative duration, which is not a stereotype of gayness but has been consistently observed in Mandarin and Western languages.}, } @article {pmid33739930, year = {2021}, author = {König, A and Riviere, K and Linz, N and Lindsay, H and Elbaum, J and Fabre, R and Derreumaux, A and Robert, P}, title = {Measuring Stress in Health Professionals Over the Phone Using Automatic Speech Analysis During the COVID-19 Pandemic: Observational Pilot Study.}, journal = {Journal of medical Internet research}, volume = {23}, number = {4}, pages = {e24191}, pmid = {33739930}, issn = {1438-8871}, mesh = {Adult ; Anxiety/*diagnosis/etiology/psychology ; Burnout, Professional/*diagnosis/etiology/psychology ; COVID-19/epidemiology/*psychology ; Female ; Health Personnel/*psychology ; Humans ; Male ; Pandemics ; Pilot Projects ; SARS-CoV-2 ; Speech/*physiology ; *Speech Acoustics ; Surveys and Questionnaires ; Telephone ; }, abstract = {BACKGROUND: During the COVID-19 pandemic, health professionals have been directly confronted with the suffering of patients and their families. By making them main actors in the management of this health crisis, they have been exposed to various psychosocial risks (stress, trauma, fatigue, etc). Paradoxically, stress-related symptoms are often underreported in this vulnerable population but are potentially detectable through passive monitoring of changes in speech behavior.

OBJECTIVE: This study aims to investigate the use of rapid and remote measures of stress levels in health professionals working during the COVID-19 outbreak. This was done through the analysis of participants' speech behavior during a short phone call conversation and, in particular, via positive, negative, and neutral storytelling tasks.

METHODS: Speech samples from 89 health care professionals were collected over the phone during positive, negative, and neutral storytelling tasks; various voice features were extracted and compared with classical stress measures via standard questionnaires. Additionally, a regression analysis was performed.
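A sketch of how such a pipeline can be assembled with open-source tools; the feature set and model here are our choices for illustration, not necessarily those used in the study:

    import numpy as np
    import librosa
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import cross_val_score

    def voice_features(path):
        """Crude prosodic and spectral summary of one recording."""
        y, sr = librosa.load(path, sr=16000)
        f0 = librosa.yin(y, fmin=60, fmax=400, sr=sr)        # fundamental frequency track
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)   # spectral envelope summary
        return np.hstack([f0.mean(), f0.std(),
                          mfcc.mean(axis=1), mfcc.std(axis=1)])

    # X: one feature row per recording; y: questionnaire stress scores (placeholders)
    # X = np.vstack([voice_features(p) for p in recording_paths])
    # mae = -cross_val_score(Ridge(alpha=1.0), X, y,
    #                        scoring="neg_mean_absolute_error", cv=5).mean()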

RESULTS: Certain speech characteristics correlated with stress levels in both genders; mainly, spectral (ie, formant) features, such as the mel-frequency cepstral coefficient, and prosodic characteristics, such as the fundamental frequency, appeared to be sensitive to stress. Overall, for both male and female participants, using vocal features from the positive tasks for regression yielded the most accurate prediction results of stress scores (mean absolute error 5.31).
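For reference, the reported error metric is the mean absolute error, MAE = (1/n) * Σ|ŷᵢ - yᵢ|, the average absolute deviation between predicted and questionnaire-derived stress scores; an MAE of 5.31 thus means predictions were off by about 5.3 points on the stress scale, on average.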

CONCLUSIONS: Automatic speech analysis could help with early detection of subtle signs of stress in vulnerable populations over the phone. By combining the use of this technology with timely intervention strategies, it could contribute to the prevention of burnout and the development of comorbidities, such as depression or anxiety.}, } @article {pmid33733165, year = {2020}, author = {Strycharczuk, P and López-Ibáñez, M and Brown, G and Leemann, A}, title = {General Northern English. Exploring Regional Variation in the North of England With Machine Learning.}, journal = {Frontiers in artificial intelligence}, volume = {3}, number = {}, pages = {48}, pmid = {33733165}, issn = {2624-8212}, abstract = {In this paper, we present a novel computational approach to the analysis of accent variation. The case study is dialect leveling in the North of England, manifested as reduction of accent variation across the North and emergence of General Northern English (GNE), a pan-regional standard accent associated with middle-class speakers. We investigated this instance of dialect leveling using random forest classification, with audio data from a crowd-sourced corpus of 105 urban, mostly highly-educated speakers from five northern UK cities: Leeds, Liverpool, Manchester, Newcastle upon Tyne, and Sheffield. We trained random forest models to identify individual northern cities from a sample of other northern accents, based on first two formant measurements of full vowel systems. We tested the models using unseen data. We relied on undersampling, bagging (bootstrap aggregation) and leave-one-out cross-validation to address some challenges associated with the data set, such as unbalanced data and relatively small sample size. The accuracy of classification provides us with a measure of relative similarity between different pairs of cities, while calculating conditional feature importance allows us to identify which input features (which vowels and which formants) have the largest influence in the prediction. We do find a considerable degree of leveling, especially between Manchester, Leeds and Sheffield, although some differences persist. The features that contribute to these differences most systematically are typically not the ones discussed in previous dialect descriptions. We propose that the most systematic regional features are also not salient, and as such, they serve as sociolinguistic regional indicators. We supplement the random forest results with a more traditional variationist description of by-city vowel systems, and we use both sources of evidence to inform a description of the vowels of General Northern English.}, } @article {pmid33705674, year = {2021}, author = {Niziolek, CA and Parrell, B}, title = {Responses to Auditory Feedback Manipulations in Speech May Be Affected by Previous Exposure to Auditory Errors.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6S}, pages = {2169-2181}, pmid = {33705674}, issn = {1558-9102}, support = {R00 DC014520/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; }, mesh = {Feedback ; Feedback, Sensory ; Humans ; Reproducibility of Results ; *Speech ; *Speech Perception ; }, abstract = {Purpose Speakers use auditory feedback to guide their speech output, although individuals differ in the magnitude of their compensatory response to perceived errors in feedback. 
Little is known about the factors that contribute to the compensatory response or how fixed or flexible they are within an individual. Here, we test whether manipulating the perceived reliability of auditory feedback modulates speakers' compensation to auditory perturbations, as predicted by optimal models of sensorimotor control. Method Forty participants produced monosyllabic words in two separate sessions, which differed in the auditory feedback given during an initial exposure phase. In the veridical session exposure phase, feedback was normal. In the noisy session exposure phase, small, random formant perturbations were applied, reducing reliability of auditory feedback. In each session, a subsequent test phase introduced larger unpredictable formant perturbations. We assessed whether the magnitude of within-trial compensation for these larger perturbations differed across the two sessions. Results Compensatory responses to downward (though not upward) formant perturbations were larger in the veridical session than the noisy session. However, in post hoc testing, we found the magnitude of this effect is highly dependent on the choice of analysis procedures. Compensation magnitude was not predicted by other production measures, such as formant variability, and was not reliably correlated across sessions. Conclusions Our results, though mixed, provide tentative support that the feedback control system monitors the reliability of sensory feedback. These results must be interpreted cautiously given the potentially limited stability of auditory feedback compensation measures across analysis choices and across sessions. Supplemental Material https://doi.org/10.23641/asha.14167136.}, } @article {pmid33705004, year = {2021}, author = {Hernández-García, E and Velazquez, LM and González, R and Godino Llorente, JI and Plaza, G}, title = {Influence of Upper Airway Surgery on Voice and Speech Recognition.}, journal = {The Journal of craniofacial surgery}, volume = {32}, number = {2}, pages = {660-663}, doi = {10.1097/SCS.0000000000007175}, pmid = {33705004}, issn = {1536-3732}, mesh = {Humans ; Prospective Studies ; Speech ; Speech Acoustics ; *Speech Perception ; *Voice ; Voice Quality ; }, abstract = {PURPOSE: Upper airway surgery comprises a set of techniques that modify the anatomy of the vocal tract, including tonsillectomy and septoplasty. The objective of this work is to study the changes in acoustic parameters and the effects on the identification or verification of the speaker through the speech produced after the vocal tract surgeries, comparing them with a control group.

METHODS: A prospective study was performed between January 2019 and June 2019. The final study sample consisted of 84 patients who met the inclusion criteria: 31 underwent septoplasty, 26 underwent tonsillectomy, and 27 served as controls. Demographic data and GRBAS ratings were statistically evaluated. Tests were administered before surgery, 2 weeks after surgery, and 3 months later. Furthermore, to establish the equal error rate, patients' voices were recorded and subsequently subjected to acoustic analysis and automatic speaker identification through machine learning systems.
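The equal error rate used here is the operating point at which the false-acceptance and false-rejection rates of a verification system coincide; it can be located from trial scores with a standard ROC sweep. A self-contained sketch with synthetic scores (not study data):

    import numpy as np
    from sklearn.metrics import roc_curve

    def equal_error_rate(labels, scores):
        """labels: 1 = same speaker, 0 = different; scores: higher = more similar."""
        fpr, tpr, _ = roc_curve(labels, scores)
        fnr = 1.0 - tpr
        i = np.nanargmin(np.abs(fnr - fpr))       # point where FAR ~= FRR
        return (fpr[i] + fnr[i]) / 2.0

    rng = np.random.default_rng(0)
    genuine = rng.normal(1.0, 0.5, 200)           # same-speaker trial scores
    impostor = rng.normal(0.0, 0.5, 2000)         # different-speaker trial scores
    labels = np.r_[np.ones(200), np.zeros(2000)]
    print(equal_error_rate(labels, np.r_[genuine, impostor]))  # ~0.16 for these distributions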

RESULTS: A significant change in GRBAS was observed after surgery. Regarding acoustic parameters, a greater change in the fundamental frequency was observed 2 weeks after surgery in the tonsillectomy group. At 3 months, formants (F1-F3) and antiformants (AntiF1-AntiF3) had changed in the septoplasty group but not in the tonsillectomy or control groups. When studying the impact of voice changes on speaker verification through speech, a greater recognition error was observed in the tonsillectomy group at 2 weeks, consistent with the results obtained for the other parameters studied.

CONCLUSIONS: Results suggest that upper airway surgery modifies the vocal tract, affecting GRBAS ratings and acoustic parameters, including formants and antiformants, and thereby affects verification of the speaker through speech.}, } @article {pmid33679344, year = {2021}, author = {Riedinger, M and Nagels, A and Werth, A and Scharinger, M}, title = {Asymmetries in Accessing Vowel Representations Are Driven by Phonological and Acoustic Properties: Neural and Behavioral Evidence From Natural German Minimal Pairs.}, journal = {Frontiers in human neuroscience}, volume = {15}, number = {}, pages = {612345}, pmid = {33679344}, issn = {1662-5161}, abstract = {In vowel discrimination, commonly found discrimination patterns are directional asymmetries where discrimination is faster (or easier) if differing vowels are presented in a certain sequence compared to the reversed sequence. Different models of speech sound processing try to account for these asymmetries based on either phonetic or phonological properties. In this study, we tested and compared two of those often-discussed models, namely the Featurally Underspecified Lexicon (FUL) model (Lahiri and Reetz, 2002) and the Natural Referent Vowel (NRV) framework (Polka and Bohn, 2011). While most studies presented isolated vowels, we investigated a large stimulus set of German vowels in a more naturalistic setting within minimal pairs. We conducted a mismatch negativity (MMN) study in a passive and a reaction time study in an active oddball paradigm. In both data sets, we found directional asymmetries that can be explained by either phonological or phonetic theories. While behaviorally, the vowel discrimination was based on phonological properties, both tested models failed to comprehensively explain the observed neural patterns. Therefore, we additionally examined in depth, via multiple regression analyses, the influence of a variety of articulatory, acoustical, and lexical factors (e.g., formant structure, intensity, duration, and frequency of occurrence), as well as factors beyond the well-known ones (perceived loudness of vowels, degree of openness). The analyses revealed that the perceptual factor of perceived loudness has a greater impact than the literature has assumed and should be taken more strongly into consideration when analyzing preattentive natural vowel processing.}, } @article {pmid33675539, year = {2021}, author = {Kim, KS and Max, L}, title = {Speech auditory-motor adaptation to formant-shifted feedback lacks an explicit component: Reduced adaptation in adults who stutter reflects limitations in implicit sensorimotor learning.}, journal = {The European journal of neuroscience}, volume = {53}, number = {9}, pages = {3093-3108}, pmid = {33675539}, issn = {1460-9568}, support = {R01 DC017444/DC/NIDCD NIH HHS/United States ; R01 DC014510/DC/NIDCD NIH HHS/United States ; MOP-137001//CIHR/Canada ; }, mesh = {Adaptation, Physiological ; Adult ; Child ; Feedback ; Feedback, Sensory ; Humans ; Learning ; *Speech ; *Stuttering ; }, abstract = {The neural mechanisms underlying stuttering remain poorly understood. A large body of work has focused on sensorimotor integration difficulties in individuals who stutter, including recently the capacity for sensorimotor learning. Typically, sensorimotor learning is assessed with adaptation paradigms in which one or more sensory feedback modalities are experimentally perturbed in real time.
Our own previous work on speech with perturbed auditory feedback revealed substantial auditory-motor learning limitations in both children and adults who stutter (AWS). It remains unknown, however, which subprocesses of sensorimotor learning are impaired. Indeed, new insights from research on upper limb motor control indicate that sensorimotor learning involves at least two distinct components: (a) an explicit component that includes intentional strategy use and presumably is driven by target error and (b) an implicit component that updates an internal model without awareness of the learner and presumably is driven by sensory prediction error. Here, we attempted to dissociate these components for speech auditory-motor learning in AWS versus adults who do not stutter (AWNS). Our formant-shift auditory-motor adaptation results replicated previous findings that such sensorimotor learning is limited in AWS. Novel findings are that neither control nor stuttering participants reported any awareness of changing their productions in response to the auditory perturbation and that neither group showed systematic drift in auditory target judgments made throughout the adaptation task. These results indicate that speech auditory-motor adaptation to formant-shifted feedback relies exclusively on implicit learning processes. Thus, limited adaptation in AWS reflects poor implicit sensorimotor learning.}, } @article {pmid33658966, year = {2021}, author = {Stefanich, S and Cabrelli, J}, title = {The Effects of L1 English Constraints on the Acquisition of the L2 Spanish Alveopalatal Nasal.}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {640354}, pmid = {33658966}, issn = {1664-1078}, abstract = {This study examines whether L1 English/L2 Spanish learners at different proficiency levels acquire a novel L2 phoneme, the Spanish palatal nasal /ɲ/. While alveolar /n/ is part of the Spanish and English inventories, /ɲ/, which consists of a tautosyllabic palatal nasal+glide element, is not. This crosslinguistic disparity presents potential difficulty for L1 English speakers due to L1 segmental and phonotactic constraints; the closest English approximation is the heterosyllabic sequence /nj/ (e.g., "canyon" /kænjən/ [ˈkʰæn.jən], cf. Spanish cañón "canyon" /kaɲon/ [ka.ˈɲon]). With these crosslinguistic differences in mind, we ask: (1a) Do L1 English learners of L2 Spanish produce acoustically distinct Spanish /n/ and /ɲ/ and (1b) Does the distinction of /n/ and /ɲ/ vary by proficiency? In the case that learners distinguish /n/ and /ɲ/, the second question investigates the acoustic quality of /ɲ/ to determine (2a) if learners' L2 representation patterns with that of an L1 Spanish representation or if learners rely on an L1 representation (here, English /nj/) and (2b) if the acoustic quality of L2 Spanish /ɲ/ varies as a function of proficiency. Beginner (n = 9) and advanced (n = 8) L1 English/L2 Spanish speakers and a comparison group of 10 L1 Spanish/L2 English speakers completed delayed repetition tasks in which disyllabic nonce words were produced in a carrier phrase. English critical items contained an intervocalic heterosyllabic /nj/ sequence (e.g., [ˈpʰan.jə]); Spanish critical items consisted of items with either intervocalic onset /ɲ/ (e.g., [ˈxa.ɲa]) or /n/ ([ˈxa.na]).
We measured duration and formant contours of the following vocalic portion as acoustic indices of the /n/~/ɲ/ and /ɲ/ ~/nj/ distinctions. Results show that, while L2 Spanish learners produce an acoustically distinct /n/ ~ /ɲ/ contrast even at a low level of proficiency, the beginners produce an intermediate /ɲ/ that falls acoustically between their English /nj/ and the L1 Spanish /ɲ/ while the advanced learners' Spanish /ɲ/ and English /nj/ appear to be in the process of equivalence classification. We discuss these outcomes as they relate to the robustness of L1 phonological constraints in late L2 acquisition coupled with the role of perceptual cues, functional load, and questions of intelligibility.}, } @article {pmid33657098, year = {2021}, author = {Tabas, A and von Kriegstein, K}, title = {Neural modelling of the encoding of fast frequency modulation.}, journal = {PLoS computational biology}, volume = {17}, number = {3}, pages = {e1008787}, pmid = {33657098}, issn = {1553-7358}, mesh = {Adult ; Auditory Cortex/*physiology ; Auditory Pathways/*physiology ; Computational Biology ; Female ; Humans ; Male ; *Models, Neurological ; Speech Perception/*physiology ; Young Adult ; }, abstract = {Frequency modulation (FM) is a basic constituent of vocalisation in many animals as well as in humans. In human speech, short rising and falling FM-sweeps of around 50 ms duration, called formant transitions, characterise individual speech sounds. There are two representations of FM in the ascending auditory pathway: a spectral representation, holding the instantaneous frequency of the stimuli; and a sweep representation, consisting of neurons that respond selectively to FM direction. To-date computational models use feedforward mechanisms to explain FM encoding. However, from neuroanatomy we know that there are massive feedback projections in the auditory pathway. Here, we found that a classical FM-sweep perceptual effect, the sweep pitch shift, cannot be explained by standard feedforward processing models. We hypothesised that the sweep pitch shift is caused by a predictive feedback mechanism. To test this hypothesis, we developed a novel model of FM encoding incorporating a predictive interaction between the sweep and the spectral representation. The model was designed to encode sweeps of the duration, modulation rate, and modulation shape of formant transitions. It fully accounted for experimental data that we acquired in a perceptual experiment with human participants as well as previously published experimental results. We also designed a new class of stimuli for a second perceptual experiment to further validate the model. Combined, our results indicate that predictive interaction between the frequency encoding and direction encoding neural representations plays an important role in the neural processing of FM. 
In the brain, this mechanism is likely to occur at early stages of the processing hierarchy.}, } @article {pmid33656916, year = {2021}, author = {Levy, ES and Chang, YM and Hwang, K and McAuliffe, MJ}, title = {Perceptual and Acoustic Effects of Dual-Focus Speech Treatment in Children With Dysarthria.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6S}, pages = {2301-2316}, doi = {10.1044/2020_JSLHR-20-00301}, pmid = {33656916}, issn = {1558-9102}, mesh = {Acoustics ; Adult ; *Dysarthria/etiology/therapy ; Humans ; Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {Purpose Children with dysarthria secondary to cerebral palsy may experience reduced speech intelligibility and diminished communicative participation. However, minimal research has been conducted examining the outcomes of behavioral speech treatments in this population. This study examined the effect of Speech Intelligibility Treatment (SIT), a dual-focus speech treatment targeting increased articulatory excursion and vocal intensity, on intelligibility of narrative speech, speech acoustics, and communicative participation in children with dysarthria. Method American English-speaking children with dysarthria (n = 17) received SIT in a 3-week summer camplike setting at Columbia University. SIT follows motor-learning principles to train the child-friendly, dual-focus strategy, "Speak with your big mouth and strong voice." Children produced a story narrative at baseline, immediate posttreatment (POST), and at 6-week follow-up (FUP). Outcomes were examined via blinded listener ratings of ease of understanding (n = 108 adult listeners), acoustic analyses, and questionnaires focused on communicative participation. Results SIT resulted in significant increases in ease of understanding at POST, that were maintained at FUP. There were no significant changes to vocal intensity, speech rate, or vowel spectral characteristics, with the exception of an increase in second formant difference between vowels following SIT. Significantly enhanced communicative participation was evident at POST and FUP. Considerable variability in response to SIT was observed between children. Conclusions Dual-focus treatment shows promise for improving intelligibility and communicative participation in children with dysarthria, although responses to treatment vary considerably across children. Possible mechanisms underlying the intelligibility gains, enhanced communicative participation, and variability in treatment effects are discussed.}, } @article {pmid33646815, year = {2021}, author = {Howson, PJ and Redford, MA}, title = {The Acquisition of Articulatory Timing for Liquids: Evidence From Child and Adult Speech.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {3}, pages = {734-753}, pmid = {33646815}, issn = {1558-9102}, support = {R01 HD087452/HD/NICHD NIH HHS/United States ; }, mesh = {Adolescent ; Adult ; Aged, 80 and over ; Child ; Child, Preschool ; Family ; Humans ; *Language ; Phonetics ; *Speech ; Speech Acoustics ; Speech Production Measurement ; Tongue ; Young Adult ; }, abstract = {Purpose Liquids are among the last sounds to be acquired by English-speaking children. The current study considers their acquisition from an articulatory timing perspective by investigating anticipatory posturing for /l/ versus /ɹ/ in child and adult speech. 
Method In Experiment 1, twelve 5-year-old, twelve 8-year-old, and 11 college-aged speakers produced carrier phrases with penultimate stress on monosyllabic words that had /l/, /ɹ/, or /d/ (control) as singleton onsets and /æ/ or /u/ as the vowel. Short-domain anticipatory effects were acoustically investigated based on schwa formant values extracted from the preceding determiner (= the) and dynamic formant values across the /ə#LV/ sequence. In Experiment 2, long-domain effects were perceptually indexed using a previously validated forward-gated audiovisual speech prediction task. Results Experiment 1 results indicated that all speakers distinguished /l/ from /ɹ/ along F3. Adults distinguished /l/ from /ɹ/ with a lower F2. Older children produced subtler versions of the adult pattern; their anticipatory posturing was also more influenced by the following vowel. Younger children did not distinguish /l/ from /ɹ/ along F2, but both liquids were distinguished from /d/ in the domains investigated. Experiment 2 results indicated that /ɹ/ was identified earlier than /l/ in gated adult speech; both liquids were identified equally early in 5-year-olds' speech. Conclusions The results are interpreted to suggest a pattern of early tongue-body retraction for liquids in /ə#LV/ sequences in children's speech. More generally, it is suggested that children must learn to inhibit the influence of vowels on liquid articulation to achieve an adultlike contrast between /l/ and /ɹ/ in running speech.}, } @article {pmid33639824, year = {2021}, author = {Raharjo, I and Kothare, H and Nagarajan, SS and Houde, JF}, title = {Speech compensation responses and sensorimotor adaptation to formant feedback perturbations.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {2}, pages = {1147}, pmid = {33639824}, issn = {1520-8524}, support = {R01 DC017696/DC/NIDCD NIH HHS/United States ; R01 DC017690/DC/NIDCD NIH HHS/United States ; R01 DC013979/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 NS100440/NS/NINDS NIH HHS/United States ; R01 DC010145/DC/NIDCD NIH HHS/United States ; }, mesh = {Feedback ; Feedback, Sensory ; Phonetics ; *Speech ; *Speech Perception ; }, abstract = {Control of speech formants is important for the production of distinguishable speech sounds and is achieved with both feedback and learned feedforward control. However, it is unclear whether the learning of feedforward control involves the mechanisms of feedback control. Speakers have been shown to compensate for unpredictable transient mid-utterance perturbations of pitch and loudness feedback, demonstrating online feedback control of these speech features. To determine whether similar feedback control mechanisms exist in the production of formants, responses to unpredictable vowel formant feedback perturbations were examined. Results showed similar within-trial compensatory responses to formant perturbations that were presented at utterance onset and mid-utterance. The relationship between online feedback compensation to unpredictable formant perturbations and sensorimotor adaptation to consistent formant perturbations was further examined. Within-trial online compensation responses were not correlated with across-trial sensorimotor adaptation. 
A detailed analysis of within-trial time course dynamics across trials during sensorimotor adaptation revealed that across-trial sensorimotor adaptation responses did not result from an incorporation of within-trial compensation response. These findings suggest that online feedback compensation and sensorimotor adaptation are governed by distinct neural mechanisms. These findings have important implications for models of speech motor control in terms of how feedback and feedforward control mechanisms are implemented.}, } @article {pmid33639809, year = {2021}, author = {Carignan, C}, title = {A practical method of estimating the time-varying degree of vowel nasalization from acoustic features.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {2}, pages = {911}, doi = {10.1121/10.0002925}, pmid = {33639809}, issn = {1520-8524}, mesh = {Acoustics ; Language ; *Phonetics ; *Speech Acoustics ; }, abstract = {This paper presents a simple and easy-to-use method of creating a time-varying signal of the degree of nasalization in vowels, generated from acoustic features measured in oral and nasalized vowel contexts. The method is presented for separate models constructed using two sets of acoustic features: (1) an uninformed set of 13 Mel-frequency cepstral coefficients (MFCCs) and (2) a combination of the 13 MFCCs and a phonetically informed set of 20 acoustic features of vowel nasality derived from previous research. Both models are compared against two traditional approaches to estimating vowel nasalization from acoustics: A1-P0 and A1-P1, as well as their formant-compensated counterparts. Data include productions from six speakers of different language backgrounds, producing 11 different qualities within the vowel quadrilateral. The results generated from each of the methods are compared against nasometric measurements, representing an objective "ground truth" of the degree of nasalization. The results suggest that the proposed method is more robust than conventional acoustic approaches, generating signals which correlate strongly with nasometric measures across all vowel qualities and all speakers and accurately approximate the time-varying change in the degree of nasalization. Finally, an experimental example is provided to help researchers implement the method in their own study designs.}, } @article {pmid33630668, year = {2021}, author = {Chung, H and Weismer, G}, title = {Formant Trajectory Patterns of American English /l/ Produced by Adults and Children.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {3}, pages = {809-822}, doi = {10.1044/2020_JSLHR-20-00345}, pmid = {33630668}, issn = {1558-9102}, mesh = {Adult ; Child ; Child Language ; Child, Preschool ; Female ; Humans ; Language ; Language Development ; *Phonetics ; *Speech ; Speech Acoustics ; United States ; }, abstract = {Purpose Most acoustic and articulatory studies on /l/ have focused on either duration, formant frequencies, or tongue shape during the constriction interval. Only a limited set of data exists for the transition characteristics of /l/ to and from surrounding vowels. The aim of this study was to examine second formant (F2) transition characteristics of /l/ produced by young children and adults. This was to better understand articulatory behaviors in the production of /l/ and potential clinical applications of these data to typical and delayed /l/ development. 
Method Participants included 17 children with typically developing speech between the ages of 2 and 5 years, and 10 female adult speakers of Southern American English. Each subject produced single words containing pre- and postvocalic /l/ in two vowel contexts (/i, ɪ/ and /ɔ, ɑ/). F2 transitions, out of and into /l/ constriction intervals from the adjacent vowels, were analyzed for perceptually acceptable /l/ productions. The F2 transition extent, duration, and rate, as well as F2 loci data, were compared across age groups by vowel context for both pre- and postvocalic /l/. Results F2 transitions of adults' /l/ showed a great similarity across and within speakers. Those of young children showed greater variability, but became increasingly similar to those of adults with age. The F2 loci data seemed consistent with greater coarticulation among children than adults. This conclusion, however, must be regarded as preliminary due to the possible influence of different vocal tract size across ages and variability in the data. Conclusions The results suggest that adult patterns can serve as a reliable reference to which children's /l/ productions can be evaluated. The articulatory configurations associated with the /l/ constriction interval and the vocal tract movements into and out of that interval may provide insight into the underlying difficulties related to misarticulated /l/.}, } @article {pmid33615923, year = {2021}, author = {Ng, ML and Woo, HK}, title = {Effect of total laryngectomy on vowel production: An acoustic study of vowels produced by alaryngeal speakers of Cantonese.}, journal = {International journal of speech-language pathology}, volume = {23}, number = {6}, pages = {652-661}, doi = {10.1080/17549507.2021.1876166}, pmid = {33615923}, issn = {1754-9515}, mesh = {Acoustics ; Humans ; Laryngectomy ; *Larynx, Artificial ; Phonetics ; Speech ; Speech Acoustics ; *Speech, Alaryngeal ; }, abstract = {Purpose: To investigate the effect of total laryngectomy on vowel production, the present study examined the change in vowel articulation associated with different types of alaryngeal speech in comparison with laryngeal speech using novel derived formant metrics.Method: Six metrics derived from the first two formants (F1 and F2) including the First and Second Formant Range Ratios (F1RR and F2RR), triangular and pentagonal Vowel Space Area (tVSA and pVSA), Formant Centralisation Ratio (FCR) and Average Vowel Spacing (AVS) were measured from vowels (/i, y, ɛ, a, ɔ, œ, u/) produced by oesophageal (ES), tracheoesophageal (TE), electrolaryngeal (EL), pneumatic artificial laryngeal (PA) speakers, as well as laryngeal speakers.Result: Data revealed a general reduction in articulatory range and a tendency of vowel centralisation in Cantonese alaryngeal speakers. Significant articulatory difference was found for PA and EL compared with ES, TE, and laryngeal speakers.Conclusion: The discrepant results among alaryngeal speakers may be related to the difference in new sound source (external vs internal). 
Sensitivity and correlation analyses confirmed that the use of the matrix of derived formant metrics provided a more comprehensive profile of the articulatory pattern in the alaryngeal population.}, } @article {pmid33608184, year = {2023}, author = {Maryn, Y and Wuyts, FL and Zarowski, A}, title = {Are Acoustic Markers of Voice and Speech Signals Affected by Nose-and-Mouth-Covering Respiratory Protective Masks?.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {3}, pages = {468.e1-468.e12}, pmid = {33608184}, issn = {1873-4588}, mesh = {Humans ; *Speech ; Masks ; Speech Acoustics ; *COVID-19 ; Acoustics ; Speech Production Measurement ; }, abstract = {BACKGROUND: Worldwide use of nose-and-mouth-covering respiratory protective masks (RPMs) has become ubiquitous during the COVID-19 pandemic. Consequences of wearing RPMs, especially regarding perception and production of spoken communication, are gradually emerging. The present study explored how three prevalent RPMs affect various speech and voice sound properties.

METHODS: Pre-recorded sustained [a] vowels and read sentences from 47 subjects were played by a speech production model ('Voice Emitted by Spare Parts', or 'VESPA') in four conditions: without RPM (C1), with disposable surgical mask (C2), with FFP2 mask (C3), and with transparent plastic mask (C4). Differences between C1 and masked conditions were assessed with Dunnett's t test in 26 speech sound properties related to voice production (fundamental frequency, sound intensity level), voice quality (jitter percent, shimmer percent, harmonics-to-noise ratio, smoothed cepstral peak prominence, Acoustic Voice Quality Index), articulation and resonance (first and second formant frequencies, first and second formant bandwidths, spectral center of gravity, spectral standard deviation, spectral skewness, spectral kurtosis, spectral slope, and spectral energy in ten 1-kHz bands from 0 to 10 kHz).
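Several of the listed markers (spectral center of gravity, standard deviation, skewness, kurtosis, and the 1-kHz band energies) are moments and partial sums of the power spectrum, and the masked-versus-control design matches Dunnett's test. A sketch under those assumptions — y and fs are placeholders, the moment definitions follow one common convention that may differ in detail from Praat's, and scipy.stats.dunnett requires SciPy >= 1.11:

    import numpy as np
    from scipy import stats

    def spectral_moments(y, fs):
        """Center of gravity, SD, skewness, and kurtosis of the power spectrum."""
        power = np.abs(np.fft.rfft(y)) ** 2
        f = np.fft.rfftfreq(len(y), 1.0 / fs)
        p = power / power.sum()                   # treat the spectrum as a distribution
        cog = np.sum(f * p)
        sd = np.sqrt(np.sum((f - cog) ** 2 * p))
        skew = np.sum(((f - cog) / sd) ** 3 * p)
        kurt = np.sum(((f - cog) / sd) ** 4 * p)
        return cog, sd, skew, kurt

    def band_energies(y, fs, width=1000, top=10000):
        """Spectral energy in consecutive 1-kHz bands from 0 Hz to top."""
        power = np.abs(np.fft.rfft(y)) ** 2
        f = np.fft.rfftfreq(len(y), 1.0 / fs)
        return [power[(f >= lo) & (f < lo + width)].sum()
                for lo in range(0, top, width)]

    # Masked conditions compared against the unmasked control, one marker at a time:
    # res = stats.dunnett(cog_c2, cog_c3, cog_c4, control=cog_c1)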

RESULTS: C2, C3, and C4 significantly affected 10, 15, and 19 of the acoustic speech markers, respectively. Furthermore, absolute differences between unmasked and masked conditions were largest for C4 and smallest for C2.

CONCLUSIONS: All RPMs influenced speech sound properties to a greater or lesser extent. However, this influence was least for surgical RPMs and most for plastic RPMs. Surgical RPMs are therefore preferred when spoken communication is a priority alongside respiratory protection.}, } @article {pmid33600430, year = {2021}, author = {Cavalcanti, JC and Eriksson, A and Barbosa, PA}, title = {Acoustic analysis of vowel formant frequencies in genetically-related and non-genetically related speakers with implications for forensic speaker comparison.}, journal = {PloS one}, volume = {16}, number = {2}, pages = {e0246645}, pmid = {33600430}, issn = {1932-6203}, mesh = {Acoustics ; Adult ; Brazil ; Forensic Sciences/methods ; Humans ; Language ; Male ; Phonetics ; Psychoacoustics ; Speech/*physiology ; *Speech Acoustics ; Speech Perception/physiology ; Twins, Monozygotic ; Verbal Behavior/*physiology ; }, abstract = {The purpose of this study was to explore the speaker-discriminatory potential of vowel formant mean frequencies in comparisons of identical twin pairs and non-genetically related speakers. The influences of lexical stress and the vowels' acoustic distances on the discriminatory patterns of formant frequencies were also assessed. Acoustic extraction and analysis of the first four speech formants F1-F4 were carried out using spontaneous speech materials. The recordings comprise telephone conversations between identical twin pairs, recorded directly through high-quality microphones. The subjects were 20 male adult speakers of Brazilian Portuguese (BP), aged between 19 and 35. For the comparisons, stressed and unstressed oral vowels of BP were segmented and transcribed manually in the Praat software. F1-F4 formant estimates were automatically extracted from the middle points of each labeled vowel. Formant values were represented in both Hertz and Bark. Comparisons within identical twin pairs using the Bark scale were performed to verify whether the measured differences would be potentially significant when following a psychoacoustic criterion. The results revealed consistent patterns regarding the comparison of low-frequency and high-frequency formants in twin pairs and non-genetically related speakers, with high-frequency formants displaying greater speaker-discriminatory power than low-frequency formants. Among all formants, F4 seemed to display the highest discriminatory potential within identical twin pairs, followed by F3. As for non-genetically related speakers, both F3 and F4 displayed a similarly high discriminatory potential. Regarding vowel quality, the central vowel /a/ was found to be the most speaker-discriminatory segment, followed by front vowels. Moreover, stressed vowels displayed higher inter-speaker discrimination than unstressed vowels in both groups; however, the combination of stressed and unstressed vowels was found to be even more explanatory of the observed differences.
Although identical twins displayed greater phonetic similarity, they were not found to be phonetically identical.}, } @article {pmid33589372, year = {2023}, author = {Lau, HYC and Scherer, RC}, title = {Objective Measures of Two Musical Interpretations of an Excerpt From Berlioz's "La mort d'Ophélie".}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {2}, pages = {301.e9-301.e25}, doi = {10.1016/j.jvoice.2020.12.045}, pmid = {33589372}, issn = {1873-4588}, mesh = {Humans ; Voice Quality ; *Music ; Speech Acoustics ; Glottis/physiology ; *Voice ; }, abstract = {OBJECTIVE/HYPOTHESIS: This study aimed to determine objective production differences relative to two emotional interpretations in performing an excerpt from a classical art song. The null hypothesis was proposed.

METHODS: The first author recorded an excerpt from an art song. The excerpt was sung with two contrasting musical interpretations: an "empathetic legato" approach, and a "sarcastic" approach characterized by emphatic attacks. Microphone, airflow, and electroglottography signals were digitized. The vowels were analyzed in terms of intensity, long-term average spectra, fundamental frequency (fo), airflow vibrato rate and extent, vowel onset slope, intensity comparison of harmonic frequencies, and glottal measures based on electroglottograph waveforms. Four consonant tokens were analyzed relative to airflow, voice onset time, and production duration.

RESULTS & CONCLUSIONS: The emphatic performance had faster vowel onset, increased glottal adduction, increased intensity of harmonics in 2-3 kHz, increased intensity in the fourth and fifth formants, inferred subglottal pressure increase, increased airflow for /f/, and greater aspiration airflow for /p, t/. Vibrato extents for intensity, fo, and airflow were wider in the emphatic approach. Findings revealed larger EGGW25 and peak-to-peak amplitude values of the electroglottography waveform, suggesting greater vocal fold contact area and longer glottal closure for the emphatic approach. Long-term average spectrum analyses of the entire production displayed minor variation across all formant frequencies, suggesting an insignificant change in vocal tract shaping between the two approaches. This single-case objective study emphasizes the reality of physiological, aerodynamic, and acoustic production differences in the interpretive and pedagogical aspects of art song performance.}, } @article {pmid33577218, year = {2021}, author = {Easwar, V and Bridgwater, E and Purcell, D}, title = {The Influence of Vowel Identity, Vowel Production Variability, and Consonant Environment on Envelope Following Responses.}, journal = {Ear and hearing}, volume = {42}, number = {3}, pages = {662-672}, doi = {10.1097/AUD.0000000000000966}, pmid = {33577218}, issn = {1538-4667}, mesh = {Acoustic Stimulation ; Evoked Potentials, Auditory, Brain Stem ; Humans ; Language ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception ; Young Adult ; }, abstract = {OBJECTIVES: The vowel-evoked envelope following response (EFR) is a useful tool for studying brainstem processing of speech in natural consonant-vowel productions. Previous work, however, demonstrates that the amplitude of EFRs is highly variable across vowels. To clarify factors contributing to the variability observed, the objectives of the present study were to evaluate: (1) the influence of vowel identity and the consonant context surrounding each vowel on EFR amplitude and (2) the effect of variations in repeated productions of a vowel on EFR amplitude while controlling for the consonant context.

DESIGN: In Experiment 1, EFRs were recorded in response to seven English vowels (/ij/, /ɪ/, /ej/, /ε/, /æ/, /u/, and a seventh vowel whose IPA symbol is rendered only as an inline graphic in the source record) embedded in each of four consonant contexts (/hVd/, /sVt/, /zVf/, and a fourth /CVv/ context whose initial consonant is likewise rendered only as an inline graphic). In Experiment 2, EFRs were recorded in response to four different variants of one of the four possible vowels (/ij/, /ε/, /æ/, or the inline-graphic vowel), embedded in the same consonant-vowel-consonant environments used in Experiment 1. All vowels were edited to minimize formant transitions before embedding in a consonant context. Different talkers were used for the two experiments. Data from a total of 30 and 64 (16 listeners/vowel) young adults with normal hearing were included in Experiments 1 and 2, respectively. EFRs were recorded using a single-channel electrode montage between the vertex and nape of the neck while stimuli were presented monaurally.

RESULTS: In Experiment 1, vowel identity had a significant effect on EFR amplitude, with the vowel /æ/ eliciting the highest amplitude EFRs (170 nV, on average), and the vowel /ej/ eliciting the lowest amplitude EFRs (106 nV, on average). The consonant context surrounding each vowel stimulus had no statistically significant effect on EFR amplitude. Similarly, in Experiment 2, consonant context did not influence the amplitude of EFRs elicited by the vowel variants. Vowel identity significantly altered EFR amplitude, with /ε/ eliciting the highest amplitude EFRs (104 nV, on average). Significant, albeit small, differences (<21 nV, on average) in EFR amplitude were evident between some variants of /ε/ and /u/.
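The nanovolt amplitudes reported above are spectral magnitudes at the stimulus f0. A minimal sketch of the usual estimation route follows, on synthetic data and omitting the artifact rejection and noise-floor statistics a real EFR analysis would add.

```python
# Illustrative estimate of EFR amplitude as the spectral magnitude of the
# averaged response at the stimulus f0. Synthetic data; all parameters are
# assumptions for illustration, not the study's recording settings.
import numpy as np

fs, f0, n_epochs, dur = 8000, 100.0, 300, 0.5
t = np.arange(int(fs * dur)) / fs
rng = np.random.default_rng(0)

# Synthetic epochs: a small 100-Hz "response" buried in noise (arbitrary units).
epochs = 0.1 * np.sin(2 * np.pi * f0 * t) + rng.normal(0, 5, (n_epochs, t.size))

avg = epochs.mean(axis=0)                        # averaging suppresses noise
spec = np.abs(np.fft.rfft(avg)) / t.size * 2     # single-sided amplitude spectrum
freqs = np.fft.rfftfreq(t.size, 1 / fs)
efr_amp = spec[np.argmin(np.abs(freqs - f0))]    # amplitude at f0
print(f"EFR amplitude at {f0} Hz: {efr_amp:.3f} (arbitrary units)")
```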

CONCLUSION: Based on a comprehensive set of naturally produced vowel samples in carefully controlled consonant contexts, the present study provides additional evidence for the sensitivity of EFRs to vowel identity and variations in vowel production. The surrounding consonant context (after removal of formant transitions) has no measurable effect on EFRs, irrespective of vowel identity and variant. The sensitivity of EFRs to nuances in vowel acoustics emphasizes the need for adequate control and evaluation of stimuli proposed for clinical and research purposes.}, } @article {pmid33568701, year = {2021}, author = {Hodges-Simeon, CR and Grail, GPO and Albert, G and Groll, MD and Stepp, CE and Carré, JM and Arnocky, SA}, title = {Testosterone therapy masculinizes speech and gender presentation in transgender men.}, journal = {Scientific reports}, volume = {11}, number = {1}, pages = {3494}, pmid = {33568701}, issn = {2045-2322}, support = {T32 DC013017/DC/NIDCD NIH HHS/United States ; DC013017/NH/NIH HHS/United States ; }, mesh = {Adult ; Humans ; Male ; Speech/*drug effects/physiology ; Speech Acoustics ; Speech Perception/*drug effects/physiology ; Testosterone/*pharmacology ; Transgender Persons/psychology ; Transsexualism/*drug therapy ; Voice/drug effects ; Voice Quality/drug effects ; Young Adult ; }, abstract = {Voice is one of the most noticeably dimorphic traits in humans and plays a central role in gender presentation. Transgender males seeking to align internal identity and external gender expression frequently undergo testosterone (T) therapy to masculinize their voices and other traits. We aimed to determine the importance of changes in vocal masculinity for transgender men and to determine the effectiveness of T therapy at masculinizing three speech parameters: fundamental frequency (i.e., pitch) mean and variation (fo and fo-SD) and estimated vocal tract length (VTL) derived from formant frequencies. Thirty transgender men aged 20 to 40 rated their satisfaction with traits prior to and after T therapy and contributed speech samples and salivary T. Similar-aged cisgender men and women contributed speech samples for comparison. We show that transmen viewed voice change as critical to transition success compared to other masculine traits. However, T therapy may not be sufficient to fully masculinize speech: while fo and fo-SD were largely indistinguishable from those of cismen, VTL was intermediate between cismen and ciswomen. fo was correlated with salivary T, and VTL was associated with T therapy duration. This argues for additional approaches, such as behavior therapy and/or longer duration of hormone therapy, to improve speech transition.}, } @article {pmid33555417, year = {2021}, author = {Heimbauer, LA and Beran, MJ and Owren, MJ}, title = {A chimpanzee recognizes varied acoustical versions of sine-wave and noise-vocoded speech.}, journal = {Animal cognition}, volume = {24}, number = {4}, pages = {843-854}, pmid = {33555417}, issn = {1435-9456}, support = {IBN-9876754//National Science Foundation/ ; }, mesh = {Acoustic Stimulation/veterinary ; Animals ; Cues ; Noise ; Pan troglodytes ; *Speech ; *Speech Perception ; }, abstract = {Previous research demonstrated that a language-trained chimpanzee recognized familiar English words in sine-wave and noise-vocoded forms (Heimbauer et al. Curr Biol 21:1210-1214, 2011). However, those results did not provide information regarding processing strategies of the specific acoustic cues to which the chimpanzee may have attended.
The current experiments tested this chimpanzee and adult humans using sine-wave and noise-vocoded speech, manipulated by varying which sine-wave components were present and the number of noise bands, respectively. Similar to humans tested with the same stimuli, the chimpanzee was more successful at identifying sine-wave speech when both SW1 and SW2 were present, these being the components modeled on formants F1 and F2 in the natural speech signal. Results with noise-vocoded speech revealed that the chimpanzee and humans performed best with stimuli that included four or five noise bands, as compared to those with three or two. Overall, amplitude and frequency modulation over time were important for identification of sine-wave and noise-vocoded speech, with further evidence that a nonhuman primate is capable of using top-down processes for speech perception when the signal is altered and incomplete.}, } @article {pmid33524265, year = {2021}, author = {Yang, J and Xu, L}, title = {Vowel Production in Prelingually Deafened Mandarin-Speaking Children With Cochlear Implants.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {2}, pages = {664-682}, doi = {10.1044/2020_JSLHR-20-00469}, pmid = {33524265}, issn = {1558-9102}, mesh = {Acoustics ; Adult ; Child ; Child, Preschool ; *Cochlear Implantation ; *Cochlear Implants ; *Deafness/surgery ; Humans ; Phonetics ; Speech Acoustics ; *Speech Perception ; }, abstract = {Purpose The purpose of this study was to characterize the acoustic profile and to evaluate the intelligibility of vowel productions in prelingually deafened, Mandarin-speaking children with cochlear implants (CIs). Method Twenty-five children with CIs and 20 age-matched children with normal hearing (NH) were recorded producing a list of Mandarin disyllabic and trisyllabic words containing 20 Mandarin vowels [a, i, u, y, ɤ, ɿ, ʅ, ai, ei, ia, ie, ye, ua, uo, au, ou, iau, iou, uai, uei] located in the first consonant-vowel syllable. The children with CIs were all prelingually deafened and received unilateral implantation before 7 years of age with an average length of CI use of 4.54 years. In the acoustic analysis, the first two formants (F1 and F2) were extracted at seven equidistant time locations for the tested vowels. The durational and spectral features were compared between the CI and NH groups. In the vowel intelligibility task, the extracted vowel portions in both NH and CI children were presented to six Mandarin-speaking, NH adult listeners for identification. Results The acoustic analysis revealed that the children with CIs deviated from the NH controls in the acoustic features for both single vowels and compound vowels. The acoustic deviations were reflected in longer duration, more scattered vowel categories, smaller vowel space area, and distinct formant trajectories in the children with CIs in comparison to NH controls. The vowel intelligibility results showed that the recognition accuracy of the vowels produced by the children with CIs was significantly lower than that of the NH children. The confusion pattern of vowel recognition in the children with CIs generally followed that in the NH children.
Conclusion Our data suggested that the prelingually deafened children with CIs, with a relatively long duration of CI experience, still showed measurable acoustic deviations and lower intelligibility in vowel productions in comparison to the NH children.}, } @article {pmid33522087, year = {2021}, author = {Carl, M and Icht, M}, title = {Acoustic vowel analysis and speech intelligibility in young adult Hebrew speakers: Developmental dysarthria versus typical development.}, journal = {International journal of language & communication disorders}, volume = {56}, number = {2}, pages = {283-298}, doi = {10.1111/1460-6984.12598}, pmid = {33522087}, issn = {1460-6984}, mesh = {Acoustics ; Adolescent ; *Dysarthria/diagnosis ; Humans ; Language ; Phonetics ; Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; Young Adult ; }, abstract = {BACKGROUND: Developmental dysarthria is a motor speech impairment commonly characterized by varying levels of reduced speech intelligibility. The relationship between intelligibility deficits and acoustic vowel space among these individuals has long been noted in the literature, with evidence of vowel centralization (e.g., in English and Mandarin). However, the degree to which this centralization occurs and the intelligibility-acoustic relationship is maintained in different vowel systems has yet to be studied thoroughly. In comparison with American English, the Hebrew vowel system is significantly smaller, with a potentially smaller vowel space area, a factor that may impact upon the comparisons of the acoustic vowel space and its correlation with speech intelligibility. Data on vowel space and speech intelligibility are particularly limited for Hebrew speakers with motor speech disorders.
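For the formant-trajectory method in the Yang & Xu entry above (F1/F2 sampled at seven equidistant time points), a sketch using praat-parselmouth, a Python interface to Praat, follows; the file name, vowel interval, and formant ceiling are hypothetical.

```python
# Sketch of extracting F1/F2 at seven equidistant time points across a vowel,
# as in the Yang & Xu entry above, using praat-parselmouth. The file name,
# vowel interval, and analysis settings are hypothetical.
import numpy as np
import parselmouth

snd = parselmouth.Sound("vowel_token.wav")                   # hypothetical file
formant = snd.to_formant_burg(maximum_formant=5500.0)        # child/female-range ceiling
v_start, v_end = 0.10, 0.35                                  # hypothetical vowel interval (s)

times = np.linspace(v_start, v_end, 7)                       # seven equidistant points
trajectory = [(formant.get_value_at_time(1, t),              # F1 (Hz)
               formant.get_value_at_time(2, t))              # F2 (Hz)
              for t in times]
for t, (f1, f2) in zip(times, trajectory):
    print(f"{t:.3f}s  F1={f1:.0f} Hz  F2={f2:.0f} Hz")
```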

AIMS: To determine the nature and degree of vowel space centralization in Hebrew-speaking adolescents and young adults with dysarthria, in comparison with typically developing (TD) peers, and to correlate these findings with speech intelligibility scores.

METHODS & PROCEDURES: Adolescents and young adults with developmental dysarthria (secondary to cerebral palsy (CP) and other motor deficits, n = 17) and their TD peers (n = 17) were recorded producing Hebrew corner vowels within single words. For intelligibility assessments, naïve listeners transcribed those words produced by speakers with CP, and intelligibility scores were calculated.

OUTCOMES & RESULTS: Acoustic analysis of vowel formants (F1, F2) revealed a centralization of vowel space among speakers with CP for all acoustic metrics of vowel formants, and mainly for the formant centralization ratio (FCR), in comparison with TD peers. Intelligibility scores were correlated strongly with the FCR metric for speakers with CP.
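The FCR-intelligibility correlation reported above is straightforward to compute once per-speaker FCR values (see the FCR sketch earlier in this section) and transcription-based intelligibility scores are in hand. A sketch with fabricated illustrative numbers, not the study's data:

```python
# Minimal sketch of correlating a vowel-centralization metric with speech
# intelligibility scores, as reported in the entry above. The numbers are
# hypothetical, for illustration only.
from scipy import stats

fcr = [1.12, 1.25, 1.08, 1.31, 1.19, 1.22, 1.05, 1.28]       # hypothetical FCR values
intelligibility = [78, 55, 84, 47, 66, 60, 88, 52]           # hypothetical % words correct

r, p = stats.pearsonr(fcr, intelligibility)
print(f"Pearson r = {r:.2f}, p = {p:.3f}")
# A negative r is expected: higher FCR (more centralized vowels) should go
# with lower intelligibility.
```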

CONCLUSIONS & IMPLICATIONS: The main results, vowel space centralization for speakers with CP in comparison with TD peers, echo previous cross-linguistic results. The correlation of acoustic results with speech intelligibility carries clinical implications. Taken together, the results contribute to better characterization of the speech production deficit in Hebrew speakers with motor speech disorders. Furthermore, they may guide clinical decision-making and intervention planning to improve speech intelligibility. What this paper adds What is already known on the subject Speech production and intelligibility deficits among individuals with developmental dysarthria (e.g., secondary to CP) are well documented. These deficits have also been correlated with centralization of the acoustic vowel space, although primarily in English speakers. Little is known about the acoustic characteristics of vowels in Hebrew speakers with motor speech disorders, and whether correlations with speech intelligibility are maintained. What this paper adds to existing knowledge This study is the first to describe the acoustic characteristics of vowel space in Hebrew-speaking adolescents and young adults with developmental dysarthria. The results demonstrate a centralization of the acoustic vowel space in comparison with TD peers for all measures, as found in other languages. Correlations between acoustic measures and speech intelligibility scores were also documented. We discuss these results within the context of cross-linguistic comparisons. What are the potential or actual clinical implications of this work? The results confirm the use of objective acoustic measures in the assessment of individuals with motor speech disorders, providing such data for Hebrew-speaking adolescents and young adults. These measures can be used to determine the nature and severity of the speech deficit across languages, may guide intervention planning, as well as measure the effectiveness of intelligibility-based treatment programmes.}, } @article {pmid33514177, year = {2021}, author = {Bakst, S and Niziolek, CA}, title = {Effects of syllable stress in adaptation to altered auditory feedback in vowels.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {1}, pages = {708}, pmid = {33514177}, issn = {1520-8524}, support = {F32 DC017653/DC/NIDCD NIH HHS/United States ; K99 DC014520/DC/NIDCD NIH HHS/United States ; R00 DC014520/DC/NIDCD NIH HHS/United States ; T32 DC005359/DC/NIDCD NIH HHS/United States ; }, mesh = {*Feedback ; Humans ; Language ; Phonetics ; *Speech ; Speech Acoustics ; *Speech Perception ; Speech Production Measurement ; }, abstract = {Unstressed syllables in English most commonly contain the vowel quality [ə] (schwa), which is cross-linguistically described as having a variable target. The present study examines whether speakers are sensitive to whether their auditory feedback matches their target when producing unstressed syllables. When speakers hear themselves producing formant-altered speech, they will change their motor plans so that their altered feedback is a better match to the target. If schwa has no target, then feedback mismatches in unstressed syllables may not drive a change in production. In this experiment, participants spoke disyllabic words with initial or final stress where the auditory feedback of F1 was raised (Experiment 1) or lowered (Experiment 2) by 100 mels. Both stressed and unstressed syllables showed adaptive changes in F1.
In Experiment 1, initial-stress words showed larger adaptive decreases in F1 than final-stress words, but in Experiment 2, stressed syllables overall showed greater adaptive increases in F1 than unstressed syllables in all words, regardless of which syllable contained the primary stress. These results suggest that speakers are sensitive to feedback mismatches in both stressed and unstressed syllables, but that stress and metrical foot type may mediate the corrective response.}, } @article {pmid33495033, year = {2023}, author = {Hakanpää, T and Waaramaa, T and Laukkanen, AM}, title = {Training the Vocal Expression of Emotions in Singing: Effects of Including Acoustic Research-Based Elements in the Regular Singing Training of Acting Students.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {2}, pages = {293.e7-293.e23}, doi = {10.1016/j.jvoice.2020.12.032}, pmid = {33495033}, issn = {1873-4588}, mesh = {Humans ; *Singing ; *Voice ; Acoustics ; Students ; Emotions ; }, abstract = {OBJECTIVES: This study examines the effects of including acoustic research-based elements of the vocal expression of emotions in the singing lessons of acting students during a seven-week teaching period. This information may be useful in improving the training of interpretation in singing.
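The 100-mel F1 perturbation in the Bakst & Niziolek entry above can be translated into Hz with a mel-scale conversion. The sketch below uses the common 2595*log10(1 + f/700) convention; the authors' exact mel variant and the example F1 value are assumptions.

```python
# What a 100-mel F1 perturbation (as in the Bakst & Niziolek entry above)
# looks like in Hz, using the common O'Shaughnessy mel formula. The authors'
# exact mel variant may differ; the example F1 is hypothetical.
import numpy as np

def hz_to_mel(f):
    return 2595.0 * np.log10(1.0 + f / 700.0)

def mel_to_hz(m):
    return 700.0 * (10.0 ** (m / 2595.0) - 1.0)

f1 = 600.0                                    # hypothetical produced F1 (Hz)
for shift in (+100.0, -100.0):                # Experiment 1 raised, Experiment 2 lowered
    f1_shifted = mel_to_hz(hz_to_mel(f1) + shift)
    print(f"F1 {f1:.0f} Hz {shift:+.0f} mels -> {f1_shifted:.0f} Hz heard")
```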

STUDY DESIGN: Experimental comparative study.

METHODS: Six acting students participated in seven weeks of extra training concerning voice quality in the expression of emotions in singing. Song samples were recorded before and after the training. A control group of six acting students were recorded twice within a seven-week period, during which they participated in ordinary training. All participants sang on the vowel [a:] and on a longer phrase expressing anger, sadness, joy, tenderness, and neutral states. The vowel and phrase samples were evaluated by 34 listeners for the perceived emotion. Additionally, the vowel samples were analyzed for formant frequencies (F1-F4), sound pressure level (SPL), spectral structure (Alpha ratio = SPL(1500-5000 Hz) - SPL(50-1500 Hz)), harmonic-to-noise ratio (HNR), and perturbation (jitter, shimmer).
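A minimal sketch of the Alpha ratio defined above, estimated from a Welch power spectrum, follows; absolute SPL calibration is omitted, so only the band-level difference is meaningful in this form.

```python
# Sketch of the Alpha ratio defined above: level in the 1500-5000 Hz band
# minus level in the 50-1500 Hz band. Analysis settings are illustrative.
import numpy as np
from scipy import signal

def band_level_db(freqs, psd, lo, hi):
    band = (freqs >= lo) & (freqs < hi)
    return 10.0 * np.log10(np.trapz(psd[band], freqs[band]))

def alpha_ratio(x, fs):
    freqs, psd = signal.welch(x, fs, nperseg=4096)
    return band_level_db(freqs, psd, 1500, 5000) - band_level_db(freqs, psd, 50, 1500)

fs = 16000
t = np.arange(fs) / fs
toy = np.sin(2*np.pi*200*t) + 0.1*np.sin(2*np.pi*2500*t)   # toy "voice" signal
print(f"Alpha ratio: {alpha_ratio(toy, fs):.1f} dB")
```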

RESULTS: The number of correctly perceived expressions improved in the test group's vowel samples, while no significant change was observed in the control group. The overall recognition was higher for the phrases than for the vowel samples. Of the acoustic parameters, F1 and SPL significantly differentiated emotions in both groups, and HNR specifically differentiated emotions in the test group. After training, the Alpha ratio was additionally found to differentiate emotional expression statistically significantly.

CONCLUSIONS: The expression of emotion in the singing voice improved after seven weeks of voice quality training. The F1, SPL, Alpha ratio, and HNR differentiated emotional expression. The variation in acoustic parameters became wider after training. Similar changes were not observed after seven weeks of ordinary voice training.}, } @article {pmid33484095, year = {2021}, author = {Mendoza Ramos, V and Paulyn, C and Van den Steen, L and Hernandez-Diaz Huici, ME and De Bodt, M and Van Nuffelen, G}, title = {Effect of boost articulation therapy (BArT) on intelligibility in adults with dysarthria.}, journal = {International journal of language & communication disorders}, volume = {56}, number = {2}, pages = {271-282}, pmid = {33484095}, issn = {1460-6984}, mesh = {Adult ; Behavior Therapy ; *Dysarthria/diagnosis/therapy ; Humans ; Speech Articulation Tests ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {BACKGROUND: The articulatory accuracy of patients with dysarthria is one of the most affected speech dimensions with a high impact on speech intelligibility. Behavioural treatments of articulation can involve either direct or indirect approaches. The latter have been thoroughly investigated and are generally appreciated for their almost immediate effects on articulation and intelligibility. The number of studies on (short-term) direct articulation therapy is limited.

AIMS: To investigate the effects of short-term, boost articulation therapy (BArT) on speech intelligibility in patients with chronic or progressive dysarthria and the effect of severity of dysarthria on the outcome.

METHODS & PROCEDURES: The study consists of a two-group pre-/post-test design to assess speech intelligibility at phoneme and sentence level and during spontaneous speech, automatic speech and reading a phonetically balanced text. A total of 17 subjects with mild to severe dysarthria participated in the study and were randomly assigned to either a patient-tailored, intensive articulatory drill programme or an intensive minimal pair training. Both training programmes were based on the principles of motor learning. Each training programme consisted of five sessions of 45 min completed within one week.

OUTCOMES & RESULTS: Following treatment, a statistically significant increase in mean group intelligibility was shown at phoneme and sentence level, and in automatic sequences. This was supported by an acoustic analysis that revealed a reduction in the formant centralization ratio. Within specific severity groups, large and moderate positive effect sizes (Cohen's d) were demonstrated.

CONCLUSIONS & IMPLICATIONS: BArT successfully improves speech intelligibility in patients with chronic or progressive dysarthria at different levels of the impairment. What this paper adds What is already known on the subject Behavioural treatment of articulation in patients with dysarthria mainly involves indirect strategies, which have shown positive effects on speech intelligibility. However, there is limited evidence on the short-term effects of direct articulation therapy at the segmental level of speech. This study investigates the effectiveness of BArT on speech intelligibility in patients with chronic or progressive dysarthria at all severity levels. What this paper adds to existing knowledge The intensive and direct articulatory therapy programmes developed and applied in this study intend to reduce the impairment instead of compensating for it. This approach results in a significant improvement of speech intelligibility at different dysarthria severity levels in a short period of time, while helping to exploit and develop all available residual motor skills in persons with dysarthria. What are the potential or actual clinical implications of this work? The improvements in intelligibility demonstrate the effectiveness of BArT at the segmental level of speech. This makes it a suitable approach to consider in the treatment of patients with chronic or progressive dysarthria.}, } @article {pmid33455538, year = {2022}, author = {Kulikov, V}, title = {Voice and Emphasis in Arabic Coronal Stops: Evidence for Phonological Compensation.}, journal = {Language and speech}, volume = {65}, number = {1}, pages = {73-104}, pmid = {33455538}, issn = {1756-6053}, mesh = {Cues ; Humans ; Language ; Phonetics ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {The current study investigates multiple acoustic cues associated with the phonological contrasts of voicing and emphasis in the production of Arabic coronal stops: voice onset time (VOT), spectral center of gravity (SCG) of the burst, pitch (F0), and the frequencies of the first (F1) and second (F2) formants at vowel onset. The analysis of the acoustic data collected from eight native speakers of the Qatari dialect showed that the three stops form three distinct modes on the VOT scale: [d] is (pre)voiced, voiceless [t] is aspirated, and emphatic [ṭ] is voiceless unaspirated. The contrast is also maintained in spectral cues. Each cue influences production of coronal stops while their relevance to phonological contrasts varies. VOT was most relevant for voicing, but F2 was mostly associated with emphasis. The perception experiment revealed that listeners were able to categorize ambiguous tokens correctly and compensate for phonological contrasts. The listeners' results were used to evaluate three categorization models to predict the intended category of a coronal stop: a model with unweighted and unadjusted cues, a model with weighted cues compensating for phonetic context, and a model with weighted cues compensating for the voicing and emphasis contrasts.
The findings suggest that the model with phonological compensation performed most similarly to human listeners in terms of both accuracy rate and error pattern.}, } @article {pmid33441596, year = {2021}, author = {Aung, T and Goetz, S and Adams, J and McKenna, C and Hess, C and Roytman, S and Cheng, JT and Zilioli, S and Puts, D}, title = {Low fundamental and formant frequencies predict fighting ability among male mixed martial arts fighters.}, journal = {Scientific reports}, volume = {11}, number = {1}, pages = {905}, pmid = {33441596}, issn = {2045-2322}, mesh = {Acoustics ; Adult ; Aggression/*physiology/psychology ; Anthropometry ; Athletes/psychology ; Biomarkers ; Cues ; Humans ; Male ; Martial Arts/physiology ; Phenotype ; Pitch Discrimination/physiology ; Sexual Behavior/physiology/psychology ; Social Perception/psychology ; Voice/*physiology ; }, abstract = {Human voice pitch is highly sexually dimorphic and eminently quantifiable, making it an ideal phenotype for studying the influence of sexual selection. In both traditional and industrial populations, lower pitch in men predicts mating success, reproductive success, and social status and shapes social perceptions, especially those related to physical formidability. Due to practical and ethical constraints, however, scant evidence tests the central question of whether male voice pitch and other acoustic measures indicate actual fighting ability in humans. To address this, we examined pitch, pitch variability, and formant position of 475 mixed martial arts (MMA) fighters from an elite fighting league, with each fighter's acoustic measures assessed from multiple voice recordings extracted from audio or video interviews available online (YouTube, Google Video, podcasts), totaling 1312 voice recording samples. In four regression models each predicting a separate measure of fighting ability (win percentages, number of fights, Elo ratings, and retirement status), no acoustic measure significantly predicted fighting ability above and beyond covariates. However, after fight statistics, fight history, height, weight, and age were used to extract underlying dimensions of fighting ability via factor analysis, pitch and formant position negatively predicted "Fighting Experience" and "Size" factor scores in a multivariate regression model, explaining 3-8% of the variance. Our findings suggest that lower male pitch and formants may be valid cues of some components of fighting ability in men.}, } @article {pmid33413460, year = {2021}, author = {Volodin, IA and Volodina, EV and Frey, R}, title = {Rutting vocal display in male impala (Aepyceros melampus) and overlap with alarm context.}, journal = {Frontiers in zoology}, volume = {18}, number = {1}, pages = {2}, pmid = {33413460}, issn = {1742-9994}, support = {19-04-00133//Российский Фонд Фундаментальных Исследований (РФФИ)/ ; 19-04-00133//Российский Фонд Фундаментальных Исследований (РФФИ)/ ; }, abstract = {BACKGROUND: The rutting vocal display of male impala Aepyceros melampus is unique for its complexity among ruminants. This study investigates bouts of rutting calls produced towards potential mates and rival males by free-ranging male impala in Namibia. In particular, a comparison of male rutting and alarm snorts is conducted, inspired by earlier findings of mate guarding by using alarm snorts in male topi Damaliscus lunatus.
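Formant position, as used in the Aung et al. entry above, is commonly operationalized as the mean of the z-scored F1-F4 values (cf. Puts et al.). A sketch with hypothetical speaker data follows; the paper's exact operationalization may differ.

```python
# Sketch of a "formant position" measure of the kind used in the Aung et al.
# entry above: each of F1-F4 is z-scored across speakers and the four
# z-scores are averaged per speaker. All values are hypothetical.
import numpy as np

# Hypothetical mean F1-F4 (Hz) for five speakers (rows).
F = np.array([
    [510, 1520, 2480, 3560],
    [540, 1480, 2550, 3610],
    [480, 1410, 2390, 3450],
    [530, 1550, 2600, 3700],
    [500, 1460, 2450, 3520],
], dtype=float)

z = (F - F.mean(axis=0)) / F.std(axis=0)   # standardize each formant across speakers
formant_position = z.mean(axis=1)          # average the four z-scores per speaker
print(formant_position)                    # lower values = lower overall formants
```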

RESULTS: Rutting male impala produced 4-38 (13.5 ± 6.5) rutting calls per bout. We analyzed 201 bouts, containing a total of 2709 rutting calls of five types: continuous roars, produced within a single exhalation-inhalation cycle; interrupted roars, comprising a few exhalation-inhalation cycles; pant-roars, distinguished by a pant-phase with rapidly alternating inhalations and exhalations; usual snorts, lacking any roar part; and roar-snorts, starting with a short roar part. Bouts mostly started and ended with usual snorts. Continuous roars were the shortest roars. The average duration of the exhalatory phase was longest in the continuous roars and shortest in the pant-roars. The average fundamental frequency (49.7-51.4 Hz) did not differ between roar types. Vocal tract length, calculated from measurements of the first four vocal tract resonances (formants), ranged from 381 to 382 mm across all roar types. In the studied male impala, rutting snorts within bouts of rutting calls were longer and had higher upper-quartile values in the call spectra than alarm snorts produced towards potential danger.
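The vocal tract length estimate in the impala results above follows from the uniform-tube approximation, in which a tube closed at the glottis and open at the lips has resonances F_k = (2k-1)c/(4L). A sketch follows, with hypothetical formant values chosen to land near the reported 381-382 mm; the study's exact fitting procedure is not reproduced here.

```python
# Sketch of estimating vocal tract length from the first four formants, as in
# the impala entry above. Assumes a uniform tube closed at the glottis and
# open at the lips, F_k = (2k-1)c/(4L). Formant values are hypothetical.
import numpy as np

c = 350.0                                                 # speed of sound in warm, humid air (m/s)
formants_hz = np.array([230.0, 690.0, 1150.0, 1610.0])    # hypothetical F1-F4
k = np.arange(1, 5)

# Least-squares slope of F_k against (2k-1)/4, constrained through the origin;
# under the tube model this slope equals c/L.
x = (2 * k - 1) / 4.0
slope = np.sum(x * formants_hz) / np.sum(x * x)
vtl_m = c / slope
print(f"Estimated VTL: {vtl_m * 1000:.0f} mm")
```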

CONCLUSIONS: Additional inhalations during the emission of the interrupted and pant-roars prolong their duration compared to the continuous roars but do not affect the fundamental frequency or the degree of larynx retraction while roaring. Alarm snorts are separated from one another by large intervals, whereas the intervals between rutting snorts within bouts are short. Sometimes, rutting snorts alternate with roars, whereas alarm snorts do not. Therefore, it is not the acoustic structure of individual snorts but the temporal sequence and the occasional association with another call type that defines snorts as either rutting or alarm snorts. The rutting snorts of male impala may function to attract the attention of receptive females and delay their departure from a male's harem or territory.}, } @article {pmid33399816, year = {2021}, author = {Bodaghi, D and Jiang, W and Xue, Q and Zheng, X}, title = {Effect of Supraglottal Acoustics on Fluid-Structure Interaction During Human Voice Production.}, journal = {Journal of biomechanical engineering}, volume = {143}, number = {4}, pages = {}, pmid = {33399816}, issn = {1528-8951}, support = {R01 DC009616/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Acoustics ; *Glottis/physiology ; *Voice ; Hydrodynamics ; Phonation ; Vibration ; Vocal Cords/physiology ; Models, Biological ; Pressure ; }, abstract = {A hydrodynamic/acoustic splitting method was used to examine the effect of supraglottal acoustics on fluid-structure interactions during human voice production in a two-dimensional computational model. The accuracy of the method in simulating compressible flows in typical human airway conditions was verified by comparing it to full compressible flow simulations. The method was coupled with a three-mass model of vocal fold lateral motion to simulate fluid-structure interactions during human voice production. By separating the acoustic perturbation components of the airflow, the method allows isolation of the role of supraglottal acoustics in fluid-structure interactions. The results showed that an acoustic resonance between a higher harmonic of the sound source and the first formant of the supraglottal tract occurred during normal human phonation when the fundamental frequency was much lower than the formants. The resonance resulted in an acoustic pressure perturbation at the glottis, which was of the same order as the incompressible flow pressure and was found to affect vocal fold vibrations and the glottal flow rate waveform. Specifically, the acoustic perturbation delayed the opening of the glottis, reduced the vertical phase difference of vocal fold vibrations, and decreased flow rate and maximum flow deceleration rate (MFDR) at the glottal exit; yet it had little effect on glottal opening.
The results imply that the sound generation in the glottis and acoustic resonance in the supraglottal tract are coupled processes during human voice production, and that computer modeling of vocal fold vibrations needs to include supraglottal acoustics for accurate predictions.}, } @article {pmid33397591, year = {2023}, author = {Feng, M and Howard, DM}, title = {The Dynamic Effect of the Valleculae on Singing Voice - An Exploratory Study Using 3D Printed Vocal Tracts.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {2}, pages = {178-186}, doi = {10.1016/j.jvoice.2020.12.012}, pmid = {33397591}, issn = {1873-4588}, mesh = {Humans ; *Singing ; Speech Acoustics ; *Voice/physiology ; Acoustics ; Printing, Three-Dimensional ; }, abstract = {BACKGROUND AND OBJECTIVES: The valleculae can be seen as a pair of side branches of the human vocal tract, like the piriform fossae. While the acoustic properties of the piriform fossae have been explored in detail, there is little evidence of full exploration of the acoustic properties of the valleculae. A recent investigation (Vampola, Horáček, & Švec, 2015), using a finite element model of a single vowel /a/, suggests that the valleculae created two antiresonances and two resonances in the high frequency region (above 4 kHz) along with those produced by the piriform sinuses. In the current study, we investigate, in multiple vowels, the acoustic influences of the valleculae on the singing voice, using 3-D printed vocal tracts.

METHOD: MRI data were collected from an operatic tenor singing English vowels /a/, /u/, /i/. The images of each vowel were segmented and edited to create a pair of tracts, one the original and one with the valleculae digitally removed. The printed tracts were then placed atop a vocal tract organ loudspeaker and excited by white noise. Recordings were made with a microphone placed in front of the mouths of the tracts, to measure their frequency responses.

RESULTS: Dimensional changes were observed in the valleculae across vowels, with the long-term average spectra of the recordings illustrating clear differences between the frequency responses of the va-nova (valleculae - no valleculae) pairs; these differences vary with vowel.

CONCLUSION: The experiment demonstrates the dynamic nature of the shapes of the valleculae in the human vocal tract and their acoustic consequences. It provides evidence that the valleculae have similar acoustic properties to the piriform fossae but with larger variations, and in some cases can acoustically influence the frequency region below 4 kHz. The results suggest that large-volume valleculae have the potential to impede to some extent the acoustic effect of the singer's formant cluster, and small valleculae may do the reverse. Since the volume of the valleculae is observed to be largely dependent on tongue movement and on the uttered vowel, it can be assumed that the high frequency energy, including that within the singer's formant region, could be vowel dependent. Strategies to control valleculae volumes are likely to be highly relevant to voice pedagogy practice as well as singing performance.}, } @article {pmid36154080, year = {2021}, author = {Ying Liu, Y and Polka, L and Masapollo, M and Ménard, L}, title = {Disentangling the roles of formant proximity and stimulus prototypicality in adult vowel perception.}, journal = {JASA express letters}, volume = {1}, number = {1}, pages = {015201}, doi = {10.1121/10.0003041}, pmid = {36154080}, issn = {2691-1191}, abstract = {The present investigation examined the extent to which asymmetries in vowel perception derive from a sensitivity to focalization (formant proximity), stimulus prototypicality, or both. English-speaking adults identified, rated, and discriminated a vowel series that spanned a less-focal/prototypic English /u/ and a more-focal/prototypic French /u/ exemplar. Discrimination pairs included one-step, two-step, and three-step intervals along the series. Asymmetries predicted by both focalization and prototype effects emerged when discrimination step-size was varied. The findings indicate that both generic/universal and language-specific biases shape vowel perception in adults; the latter are challenging to isolate without well-controlled stimuli and appropriately scaled discrimination tasks.}, } @article {pmid33379914, year = {2020}, author = {Lovcevic, I and Kalashnikova, M and Burnham, D}, title = {Acoustic features of infant-directed speech to infants with hearing loss.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {6}, pages = {3399}, doi = {10.1121/10.0002641}, pmid = {33379914}, issn = {1520-8524}, mesh = {Acoustics ; Adult ; *Deafness ; Female ; *Hearing Loss/diagnosis ; Humans ; Infant ; Speech ; *Speech Perception ; }, abstract = {This study investigated the effects of hearing loss and hearing experience on the acoustic features of infant-directed speech (IDS) to infants with hearing loss (HL) compared to controls with normal hearing (NH) matched by either chronological or hearing age (experiment 1) and across development in infants with hearing loss as well as the relation between IDS features and infants' developing lexical abilities (experiment 2). Both experiments included detailed acoustic analyses of mothers' productions of the three corner vowels /a, i, u/ and utterance-level pitch in IDS and in adult-directed speech. Experiment 1 demonstrated that IDS to infants with HL was acoustically more variable than IDS to hearing-age matched infants with NH.
Experiment 2 yielded no changes in IDS features over development; however, the results did show a positive relationship between formant distances in mothers' speech and infants' concurrent receptive vocabulary size, as well as between vowel hyperarticulation and infants' expressive vocabulary. These findings suggest that despite infants' HL and thus diminished access to speech input, infants with HL are exposed to IDS with generally similar acoustic qualities as are infants with NH. However, some differences persist, indicating that infants with HL might receive less intelligible speech.}, } @article {pmid33379900, year = {2020}, author = {Nault, DR and Munhall, KG}, title = {Individual variability in auditory feedback processing: Responses to real-time formant perturbations and their relation to perceptual acuity.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {6}, pages = {3709}, doi = {10.1121/10.0002923}, pmid = {33379900}, issn = {1520-8524}, abstract = {In this study, both between-subject and within-subject variability in speech perception and speech production were examined in the same set of speakers. Perceptual acuity was determined using an ABX auditory discrimination task, whereby speakers made judgments between pairs of syllables on a /ɛ/ to /æ/ acoustic continuum. Auditory feedback perturbations of the first two formants were implemented in a production task to obtain measures of compensation, normal speech production variability, and vowel spacing. Speakers repeated the word "head" 120 times under varying feedback conditions, with the final Hold phase involving the strongest perturbations of +240 Hz in F1 and -300 Hz in F2. Multiple regression analyses were conducted to determine whether individual differences in compensatory behavior in the Hold phase could be predicted by perceptual acuity, speech production variability, and vowel spacing. Perceptual acuity significantly predicted formant changes in F1, but not in F2. These results are discussed in consideration of the importance of using larger sample sizes in the field and developing new methods to explore feedback processing at the individual participant level. The potential positive role of variability in speech motor control is also considered.}, } @article {pmid33379892, year = {2020}, author = {Kothare, H and Raharjo, I and Ramanarayanan, V and Ranasinghe, K and Parrell, B and Johnson, K and Houde, JF and Nagarajan, SS}, title = {Sensorimotor adaptation of speech depends on the direction of auditory feedback alteration.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {6}, pages = {3682}, pmid = {33379892}, issn = {1520-8524}, support = {K08 AG058749/AG/NIA NIH HHS/United States ; R01 DC010145/DC/NIDCD NIH HHS/United States ; R01 DC013979/DC/NIDCD NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; }, mesh = {Feedback ; Feedback, Sensory ; Humans ; *Speech ; Speech Acoustics ; *Speech Perception ; }, abstract = {A hallmark feature of speech motor control is its ability to learn to anticipate and compensate for persistent feedback alterations, a process referred to as sensorimotor adaptation. Because this process involves adjusting articulation to counter the perceived effects of altering acoustic feedback, there are a number of factors that affect it, including the complex relationship between acoustics and articulation and non-uniformities of speech perception. 
As a consequence, sensorimotor adaptation is hypothesised to vary as a function of the direction of the applied auditory feedback alteration in vowel formant space. This hypothesis was tested in two experiments where auditory feedback was altered in real time, shifting the frequency values of the first and second formants (F1 and F2) of participants' speech. Shifts were designed on a subject-by-subject basis and sensorimotor adaptation was quantified with respect to the direction of applied shift, normalised for individual speakers. Adaptation was indeed found to depend on the direction of the applied shift in vowel formant space, independent of shift magnitude. These findings have implications for models of sensorimotor adaptation of speech.}, } @article {pmid33379880, year = {2020}, author = {Houle, N and Levi, SV}, title = {Acoustic differences between voiced and whispered speech in gender diverse speakers.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {6}, pages = {4002}, doi = {10.1121/10.0002952}, pmid = {33379880}, issn = {1520-8524}, mesh = {Acoustics ; Phonetics ; *Speech ; Speech Acoustics ; Speech Production Measurement ; *Voice ; }, abstract = {Whispered speech is a naturally produced mode of communication that lacks a fundamental frequency. Several other acoustic differences exist between whispered and voiced speech, such as speaking rate (measured as segment duration) and formant frequencies. Previous research has shown that listeners are less accurate at identifying linguistic information (e.g., identifying a speech sound) and speaker information (e.g., reporting speaker gender) from whispered speech. To further explore differences between voiced and whispered speech, acoustic differences were examined across three datasets (hVd, sVd, and ʃVd) and three speaker groups (ciswomen, transwomen, cismen). Consistent with previous studies, vowel duration was generally longer in whispered speech and formant frequencies were shifted higher, although the magnitude of these differences depended on vowel and gender. Despite the increase in duration, the acoustic vowel space area (measured either with a vowel quadrilateral or with a convex hull) was smaller in the whispered speech, suggesting that larger vowel space areas are not an automatic consequence of a lengthened articulation. Overall, these findings are consistent with previous literature showing acoustic differences between voiced and whispered speech beyond the articulatory change of eliminating fundamental frequency.}, } @article {pmid33369591, year = {2021}, author = {Ananthakrishnan, S and Grinstead, L and Yurjevich, D}, title = {Human Frequency Following Responses to Filtered Speech.}, journal = {Ear and hearing}, volume = {42}, number = {1}, pages = {87-105}, doi = {10.1097/AUD.0000000000000902}, pmid = {33369591}, issn = {1538-4667}, mesh = {Acoustic Stimulation ; Adult ; *Hearing Aids ; Humans ; Noise ; Speech ; *Speech Perception ; }, abstract = {OBJECTIVES: There is increasing interest in using the frequency following response (FFR) to describe the effects of varying different aspects of hearing aid signal processing on brainstem neural representation of speech. To this end, recent studies have examined the effects of filtering on brainstem neural representation of the speech fundamental frequency (f0) in listeners with normal hearing sensitivity by measuring FFRs to low- and high-pass filtered signals. 
However, the stimuli used in these studies do not reflect the entire range of typical cutoff frequencies used in frequency-specific gain adjustments during hearing aid fitting. Further, there has been limited discussion on the effect of filtering on brainstem neural representation of formant-related harmonics. Here, the effects of filtering on brainstem neural representation of speech fundamental frequency (f0) and harmonics related to first formant frequency (F1) were assessed by recording envelope and spectral FFRs to a vowel low-, high-, and band-pass filtered at cutoff frequencies ranging from 0.125 to 8 kHz.
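A sketch of how filtered stimulus conditions of the kind described above can be constructed follows, using zero-phase Butterworth filters from scipy; the filter family and order are assumptions, not the study's specification, and the cutoff values shown are a subset for illustration.

```python
# Sketch of constructing low-, high-, and band-pass filtered stimulus
# conditions like those described above, with zero-phase Butterworth filters.
# Filter order/family and the toy vowel are assumptions for illustration.
import numpy as np
from scipy import signal

fs = 44100
t = np.arange(int(0.3 * fs)) / fs
vowel = (np.sin(2*np.pi*100*t)            # f0 component
         + 0.5*np.sin(2*np.pi*300*t)      # low harmonic
         + 0.3*np.sin(2*np.pi*900*t))     # F1-region harmonic (toy /u/-like)

def filtered(x, btype, cutoff_hz, order=4):
    sos = signal.butter(order, cutoff_hz, btype=btype, fs=fs, output="sos")
    return signal.sosfiltfilt(sos, x)

lowpass_conditions  = {fc: filtered(vowel, "lowpass",  fc) for fc in (125, 250, 500, 750, 1000)}
highpass_conditions = {fc: filtered(vowel, "highpass", fc) for fc in (125, 250, 500, 750, 1000)}
bandpass_example    = filtered(vowel, "bandpass", (125, 1000))
print(len(lowpass_conditions), len(highpass_conditions), bandpass_example.shape)
```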

DESIGN: FFRs were measured to a synthetically generated vowel stimulus /u/ presented in a full bandwidth and low-pass (experiment 1), high-pass (experiment 2), and band-pass (experiment 3) filtered conditions. In experiment 1, FFRs were measured to a synthetically generated vowel stimulus /u/ presented in a full bandwidth condition as well as 11 low-pass filtered conditions (low-pass cutoff frequencies: 0.125, 0.25, 0.5, 0.75, 1, 1.5, 2, 3, 4, 6, and 8 kHz) in 19 adult listeners with normal hearing sensitivity. In experiment 2, FFRs were measured to the same synthetically generated vowel stimulus /u/ presented in a full bandwidth condition as well as 10 high-pass filtered conditions (hig