@article {pmid39955192, year = {2025}, author = {Liu, W and Wang, Y}, title = {Acoustic Characteristics of Tenors and Sopranos in Chinese National Singing and Bel Canto.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2025.01.039}, pmid = {39955192}, issn = {1873-4588}, abstract = {BACKGROUND: With the advancement of vocal arts, Chinese National Singing and Western Classical Singing (Bel Canto) encounter challenges in cross-cultural adaptation. Investigating formant tuning strategies and the singer's formant is crucial for scientifically characterizing the vocal production techniques in Chinese singing styles.

METHOD: Eight singers-Chinese National Singing tenors, Chinese National Singing sopranos, Bel Canto tenors, and Bel Canto sopranos-were recruited. The fundamental frequency (F0), intensity, formants, and long-term average spectrum (LTAS) were analyzed using a series of designed tasks to examine the phonation and articulation characteristics of these two singing genres in the context of cross-cultural adaptation.

RESULTS: A positive correlation between F0 and intensity was generally observed, though variations existed across vowels and singers. Both linear and non-linear relationships were found between F0 and formants. The first formant (F1) was proportional to F0, with greater variability for female singers in the vowel /a/. LTAS analysis revealed that the tenors exhibited the singer's formant in sung vowels and songs, whereas the sopranos did not exhibit this feature when singing vowels but did so in specific songs. Moreover, the primary and secondary spectral peaks in Bel Canto were less influenced by songs compared to Chinese National Singing.

CONCLUSIONS: (i) Intensity can provide an objective basis for differentiating subjective differences between singing genres, and individual differences are evident in how singers handle the relationship between F0 and intensity. (ii) Vowel modification and vowel migration in sopranos reflect consistency and variability across linguistic and cultural contexts. (iii) The presence and characteristics of the singer's formant are influenced by sex, singing genre, and song. Differences in the degree of spectral influence between the two singing genres suggest that Bel Canto emphasizes yi qiang xing zi (ie, phonation drives articulation), while Chinese National Singing emphasizes yi zi xing qiang (ie, articulation drives phonation).}, } @article {pmid39924373, year = {2025}, author = {Pan, AY and Grail, GPO and Albert, G and Groll, MD and Stepp, CE and Arnocky, SA and Hodges-Simeon, CR}, title = {What Contributes to Masculine Perception of Voice Among Transmasculine People on Testosterone Therapy?}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.12.037}, pmid = {39924373}, issn = {1873-4588}, abstract = {Voice is a highly salient and complex signal that people use to categorize another's gender. For transmasculine individuals seeking to align their gender expression with their gender identity, vocal presentation is a major concern. Voice-gender incongruence, where one's voice does not match one's gender identity, can lead to vocal strain, fatigue, emotional distress, and increased risk of suicidality. Testosterone therapy, which uses exogenous testosterone to masculinize or androgynize the voice and other secondary sexual characteristics in individuals assigned female at birth, is one method to address this issue. However, many individuals remain dissatisfied with their voice post therapy, indicating that hormonal voice modification is a complex process that is not fully understood. In the present study, we used unmodified voice samples from 30 transmasculine individuals undergoing testosterone therapy and applied multivariate analysis to determine the relative and combined effects of four acoustic parameters on two measures of gender perception. The results show that transmasculine individuals' speech is perceived as being just as "masculine" as that of cisgender males, with both groups being statistically categorized as male at similar rates. Although mean fundamental frequency and formant-estimated vocal tract length together account for a significant portion of the variance in gender perceptions, a substantial amount of variance in gender perception remains unexplained. Understanding the acoustic and sociolinguistic factors that contribute to masculine voice presentation can lead to more informed and individualized care for transmasculine individuals experiencing voice-gender incongruence and considering testosterone therapy. For this population, addressing voice-gender incongruence has important implications for life satisfaction, quality of life, and self-esteem.}, } @article {pmid39889010, year = {2024}, author = {Luo, X and Lv, J and Liu, W and Mi, C and Wang, J and Yang, L and Chu, PK and Liu, C}, title = {Double-formant PCF-SPR refractive index sensor with ultra-high double-peak-shift sensitivity and a wide detection range.}, journal = {Journal of the Optical Society of America. A, Optics, image science, and vision}, volume = {41}, number = {10}, pages = {1873-1883}, doi = {10.1364/JOSAA.530505}, pmid = {39889010}, issn = {1520-8532}, abstract = {A dual-resonance-peak photonic crystal fiber-surface plasmon resonance (PCF-SPR) refractive index (RI) sensor is designed for different wavelength ranges. The first resonance peak of the sensor is distributed in the wavelength range of 700-2350 nm, while the second peak is distributed in the range of 2350-5550 nm. In addition to detecting analytes using the full spectrum of constraint losses (CLs), it is also possible to use a single resonance peak to achieve the detection of analytes. By systematically optimizing the nanowire diameter, the diameters of the inner- and outer-layer air holes, the width of the groove, the polishing depth, and the distance from the outer-layer air holes to the fiber core, the optimal structure of the sensor is finally determined. The sensor was studied by numerical analysis, and its characteristics were evaluated by wavelength detection technology. The results show that within the RI range of 1.24-1.37, the sensor has a maximum wavelength sensitivity (WS) of 54700 nm/RIU for detecting the RI of analytes. Within the above refractive index range, the regression coefficient R² of the dual-peak-resonance wavelength is 0.99993, ensuring the accuracy of the estimated resonance wavelength of the sensor. In addition, the sensor can also use dual-peak-shift sensitivity (DPSS) to detect the refractive index, which is a relatively new sensing technology. The maximum DPSS of the sensor is 95300 nm/RIU. Due to its high sensitivity and unique dual-peak characteristics, this sensor has wide application prospects in medical diagnosis, environmental monitoring, food safety, and other fields.}, } @article {pmid39824758, year = {2025}, author = {Đinh, LG and Brunelle, M and Tạ, TT}, title = {Relating production and perception in two Raglai dialects at different stages of registrogenesis.}, journal = {Phonetica}, volume = {}, number = {}, pages = {}, pmid = {39824758}, issn = {1423-0321}, abstract = {This paper explores the perception of two diachronically related and mutually intelligible phonological oppositions, the onset voicing contrast of Northern Raglai and the register contrast of Southern Raglai. It is the continuation of a previous acoustic study that revealed that Northern Raglai onset stops maintain a voicing distinction accompanied by weak formant and voice quality modulations on following vowels, while Southern Raglai has transphonologized this voicing contrast into a register contrast marked by vowel and voice quality distinctions. Our findings indicate that the two dialects partially differ in their use of identification cues, with Northern Raglai listeners using both voicing and F1 as major cues while Southern Raglai listeners largely focus on F1. Production and perception are thus not perfectly aligned in Northern Raglai, because F1 plays a stronger role in perception than production in this dialect.
We conclude that mutual intelligibility between dialects is possible because they both use F1 for identification.}, } @article {pmid39769881, year = {2024}, author = {Jv, X and Wu, J and Mao, Q and Li, Q and Zhang, T}, title = {Development on Light and Thin Broadband Sound Absorption Structure Based on Unequal-Cross-Section Microperforated Plate Series Connection.}, journal = {Materials (Basel, Switzerland)}, volume = {17}, number = {24}, pages = {}, pmid = {39769881}, issn = {1996-1944}, support = {51965041//National Natural Science Foundation of China/ ; YC2022-s735//Jiangxi Postgraduate Innovation Special Fund Project/ ; }, abstract = {The sound absorption structure of a microperforated plate has many advantages and has great potential in the field of noise control. In order to solve the problem of broadband sound absorption of microperforated plates, a series acoustic structure of microperforated plates of unequal cross-section was designed based on the traditional microperforated plate series acoustic structure. Compared with the traditional series structure, the sudden change of cross-section increases the sound energy dissipation and greatly improves the sound absorption performance. Through the analysis of its parameters, when the overall thickness of the structure is 20 mm, its sound absorption coefficient is above 0.5 in the frequency range of 1000-3450 Hz; there are three formants, and the sound absorption coefficients corresponding to the three formants reach 1. This study provides new ideas and methods for the design of broadband acoustic structures.}, } @article {pmid39763462, year = {2024}, author = {Caragli, V and Zacheo, E and Nodari, R and Genovese, E and Mancuso, A and Mazzoni, L}, title = {Effects of face protector devices on acoustic parameters of voice.}, journal = {Acta otorhinolaryngologica Italica : organo ufficiale della Societa italiana di otorinolaringologia e chirurgia cervico-facciale}, volume = {44}, number = {6}, pages = {377-391}, pmid = {39763462}, issn = {1827-675X}, mesh = {Humans ; *COVID-19/prevention & control/transmission ; Male ; Adult ; Female ; *Personal Protective Equipment ; *Voice Quality ; *Speech Acoustics ; Masks ; Young Adult ; Middle Aged ; Voice ; }, abstract = {OBJECTIVES: The SARS-CoV-2 pandemic required the use of personal protective equipment (PPE) in medical and social contexts to reduce exposure and prevent pathogen transmission. This study aims to analyse possible changes in voice and speech parameters with and without PPE.

METHODS: Speech samples using different types of PPE were obtained. Recordings were then analysed using PRAAT software (version 6.1.42). Statistical analysis was conducted using ANOVA in Jamovi software. A post-hoc test was performed to compare PPE-related results.

RESULTS: Statistically significant differences were found in Smoothed Cepstral Peak Prominence, Harmonics-to-Noise Ratio (HNR), slope of the Long-Term Average Spectrum (LTAS), tilt of the trendline through the LTAS, shimmer parameters, mean and standard deviation of vowel HNR, and vowel and consonant formants. HNR values increased whereas shimmer parameters and formant values decreased when PPE was used [PPE combined > filtering face piece (FFP) > surgical masks > no PPE].

CONCLUSIONS: Our data show improvement in many parameters of voice and speech quality and modification of speech articulation when using masks, particularly in case of combined PPE. The most relevant changes were found with a combination of face shield and FFP2 masks. This may be due to unconscious improvements in speech articulation and increased demand on vocal folds to achieve better speech intelligibility.}, } @article {pmid39738817, year = {2024}, author = {Hu, Z and Zhang, Z and Li, H and Yang, LZ}, title = {Cross-device and test-retest reliability of speech acoustic measurements derived from consumer-grade mobile recording devices.}, journal = {Behavior research methods}, volume = {57}, number = {1}, pages = {35}, pmid = {39738817}, issn = {1554-3528}, support = {82371931//Natural Science Fund of China/ ; YZJJ202207-TS//HFIPS Director's Fund/ ; 202204295107020004//Anhui Province Key Research and Development Project/ ; }, mesh = {Humans ; Reproducibility of Results ; Male ; Female ; Adult ; Young Adult ; *Speech Acoustics ; Smartphone ; Computers, Handheld ; Speech/physiology ; }, abstract = {In recent years, there has been growing interest in remote speech assessment through automated speech acoustic analysis. While the reliability of widely used features has been validated in professional recording settings, it remains unclear how the heterogeneity of consumer-grade recording devices, commonly used in nonclinical settings, impacts the reliability of these measurements. To address this issue, we systematically investigated the cross-device and test-retest reliability of classical speech acoustic measurements in a sample of healthy Chinese adults using consumer-grade equipment across three popular speech tasks: sustained phonation (SP), diadochokinesis (DDK), and picture description (PicD). A total of 51 participants completed two recording sessions spaced at least 24 hours apart. Speech outputs were recorded simultaneously using four devices: a voice recorder, laptop, tablet, and smartphone. Our results demonstrated good reliability for fundamental frequency and cepstral peak prominence in the SP task across testing sessions and devices. Other features from the SP and PicD tasks exhibited acceptable test-retest reliability, except for the period perturbation quotient from the tablet and formant frequency from the smartphone. However, measures from the DDK task showed a significant decrease in reliability on consumer-grade recording devices compared to professional devices. These findings indicate that the lower recording quality of consumer-grade equipment may compromise the reproducibility of syllable rate estimation, which is critical for DDK analysis. This study underscores the need for standardization of remote speech monitoring methodologies to ensure that remote home assessment provides accurate and reliable results for early screening.}, } @article {pmid39734777, year = {2024}, author = {Lobmaier, JS and Klatt, WK and Schweinberger, SR}, title = {Voice of a woman: influence of interaction partner characteristics on cycle dependent vocal changes in women.}, journal = {Frontiers in psychology}, volume = {15}, number = {}, pages = {1401158}, pmid = {39734777}, issn = {1664-1078}, abstract = {INTRODUCTION: Research has shown that women's vocal characteristics change during the menstrual cycle. Further, evidence suggests that individuals alter their voices depending on the context, such as when speaking to a highly attractive person, or a person with a different social status. 
The present study aimed at investigating the degree to which women's voices change depending on the vocal characteristics of the interaction partner, and how any such changes are modulated by the woman's current menstrual cycle phase.

METHODS: Forty-two naturally cycling women were recorded once during the late follicular phase (high fertility) and once during the luteal phase (low fertility) while reproducing utterances of men and women who were previously assessed to have either attractive or unattractive voices.

RESULTS: Phonetic analyses revealed that women's voices in response to speakers changed depending on their menstrual cycle phase (F0 variation, maximum F0, centre of gravity), on the stimulus speaker's vocal attractiveness (HNR, Formants 1-3, centre of gravity), and on the stimulus speaker's sex (Formant 2). The women's vocal characteristics also differed when they reproduced spoken sentences of the stimulus speakers compared to when they read out written sentences (minimum F0, Formants 2-4).

DISCUSSION: These results provide further evidence that women alter their voice depending on the vocal characteristics of the interaction partner and that these changes are modulated by the menstrual cycle phase. Specifically, the present findings suggest that cyclic shifts in women's voices may occur only in social contexts (i.e., when a putative interaction partner is involved).}, } @article {pmid39721882, year = {2024}, author = {Xiu, N and Liu, L and Li, W and Cai, Z and Wang, Y and Wang, R and Vaxelaire, B and Sock, R and Ling, Z and Chen, J}, title = {Correlation Analysis Between Cortical Structural Features and Acoustic Features in Patients With Parkinson's Disease.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.11.042}, pmid = {39721882}, issn = {1873-4588}, abstract = {PURPOSE: Parkinson's disease (PD) is a progressive neurodegenerative disease. The aim of this study was to investigate the association between acoustic and cortical brain features in Parkinson's disease patients.

METHODS: We recruited 19 Parkinson's disease patients (eight females, 11 males) and 19 healthy subjects (eight females, 11 males) to participate in the experiment. Speech samples of three vowels (/i/, /a/, /u/), six plosives (/p/, /pʰ/, /t/, /tʰ/, /k/, /kʰ/), and three voiced consonants (/l/, /m/, /n/) were collected, and the following acoustic parameters were extracted: fundamental frequency (F0), voice onset time (VOT), voicing onset-vocalic voicing onset (VO-VVO), first formant (F1), second formant (F2), third formant (F3), first bandwidth (B1), second bandwidth (B2), third bandwidth (B3), Jitter, Shimmer, and Harmonics-to-noise ratio (HNR). We also used an Ingenia CX 3.0 T scanner to complete cranial magnetic resonance scanning and performed image processing based on the Desikan-Killiany-Tourville Atlas. We assessed the differences in acoustic and neuroimaging parameters between the PD and healthy control (HC) groups using Levene's test (LT), the two-sample independent t test (TT), and the Mann-Whitney U test (MWUT), and calculated Spearman's bias correlations for acoustic and neuroimaging parameters in the PD and HC groups, respectively.

RESULTS: For the acoustic features, the TT showed that F3 of the vowel /i/ was significantly smaller in the PD group than in the HC group, and that jitter on the vowel /u/ was significantly higher in the male PD group than in the male HC group. For other acoustic measures, there were no statistically significant differences between the two groups. For the cortical features, cortical thickness, area, and volume were reduced in the vast majority of the brains of the PD patients; however, a small portion of the cortex appeared to be thickened. In the correlation analysis between cortical and acoustic features, the acoustic parameters F0, F1, F2, F3, B2, B3, VO-VVO, Jitter, HNR, and VOT showed significant and strong correlations with the thickness, area, and volume of cortical sites such as the frontal, temporal, entorhinal, fusiform, and precuneus regions in PD patients, whereas no significant correlations were found in the HC group.

CONCLUSIONS: These findings suggest that Parkinson's disease affects both the acoustic features of patients' speech and the cortical features of their brains, and that the two sets of features are correlated.}, } @article {pmid39720068, year = {2024}, author = {Song, J and Kim, H and Lee, YO}, title = {Laryngeal disease classification using voice data: Octave-band vs. mel-frequency filters.}, journal = {Heliyon}, volume = {10}, number = {24}, pages = {e40748}, pmid = {39720068}, issn = {2405-8440}, abstract = {INTRODUCTION: Laryngeal cancer diagnosis relies on specialist examinations, but non-invasive methods using voice data are emerging with artificial intelligence (AI) advancements. Mel Frequency Cepstral Coefficients (MFCCs) are widely used for voice analysis, but Octave Frequency Spectrum Energy (OFSE) may offer better accuracy in detecting subtle voice changes.

PROBLEM STATEMENT: Accurate early diagnosis of laryngeal cancer through voice data is challenging with current methods like MFCC.

OBJECTIVES: This study compares the effectiveness of MFCC and OFSE in classifying voice data into healthy, laryngeal cancer, benign mucosal disease, and vocal fold paralysis categories.

METHODS: Voice samples from 363 patients were analyzed using CNN models, employing MFCC and OFSE with 1/3 octave band filters. Gradient-weighted Class Activation Mapping (Grad-CAM) was used to visualize key voice features.

RESULTS: OFSE with 1/3 octave band filters outperformed MFCC in classification accuracy, especially in multi-class classification including laryngeal cancer, benign mucosal disease, and vocal fold paralysis groups (0.9398 ± 0.0232 vs. 0.7061 ± 0.0561). Grad-CAM analysis revealed that OFSE with 1/3 octave band filters effectively distinguished laryngeal cancer from healthy voices by focusing on increased noise in the over-formant area and changes in the fundamental frequency. The analysis also highlighted that specific narrow frequency areas, particularly in vocal fold paralysis, were critical for classification, and benign mucosal diseases occasionally resembled healthy voices, making AI differentiation between benign conditions and laryngeal cancer a significant challenge.

CONCLUSION: OFSE with 1/3 octave band filters provides superior accuracy in diagnosing laryngeal diseases including laryngeal cancer, showing potential for non-invasive, AI-driven early detection.}, } @article {pmid39656685, year = {2024}, author = {Cavalcanti, JC and Eriksson, A and Barbosa, PA and Madureira, S}, title = {Revisiting the speaker discriminatory power of vowel formant frequencies under a likelihood ratio-based paradigm: The case of mismatched speaking styles.}, journal = {PloS one}, volume = {19}, number = {12}, pages = {e0311363}, pmid = {39656685}, issn = {1932-6203}, mesh = {Humans ; Male ; Adult ; *Speech/physiology ; Speech Acoustics ; Phonetics ; Likelihood Functions ; Young Adult ; Speech Production Measurement/methods ; Language ; }, abstract = {Differentiating subjects through the comparison of their recorded speech is a common endeavor in speaker characterization. When using an acoustic-based approach, this task typically involves scrutinizing specific acoustic parameters and assessing their discriminatory capacity. This experimental study aimed to evaluate the speaker discriminatory power of vowel formants-resonance peaks in the vocal tract-in two different speaking styles: Dialogue and Interview. Different testing procedures were applied, specifically metrics compatible with the likelihood ratio paradigm. Only high-quality recordings were analyzed in this study. The participants were 20 male Brazilian Portuguese (BP) speakers from the same dialectal area. Two speaker-discriminatory power estimates were examined through Multivariate Kernel Density analysis: log-likelihood-ratio costs (Cllr) and equal error rates (EER). As expected, the discriminatory performance was stronger for style-matched analyses than for mismatched-style analyses. In order of relevance, F3, F4, and F1 performed the best in style-matched comparisons, as suggested by lower Cllr and EER values. F2 performed the worst intra-style in both Dialogue and Interview. The discriminatory power of all individual formants (F1-F4) appeared to be affected in the mismatched condition, demonstrating that discriminatory power is sensitive to style-driven changes in speech production. The combination of higher formants 'F3 + F4' outperformed the combination of lower formants 'F1 + F2'. However, in mismatched-style analyses, the magnitude of improvement in Cllr and EER scores increased as more formants were incorporated into the model. The best discriminatory performance was achieved when most formants were combined. Applying multivariate analysis not only reduced average Cllr and EER scores but also influenced the overall probability distribution, shifting the probability density distribution towards lower Cllr and EER values. In general, front and central vowels were found to be more speaker-discriminatory than back vowels as far as the 'F1 + F2' relation was concerned.}, } @article {pmid39656649, year = {2024}, author = {Cervantes Constantino, F and Caputi, Á}, title = {Cortical tracking of speakers' spectral changes predicts selective listening.}, journal = {Cerebral cortex (New York, N.Y. : 1991)}, volume = {34}, number = {12}, pages = {}, doi = {10.1093/cercor/bhae472}, pmid = {39656649}, issn = {1460-2199}, support = {FCE_1_2019_1_155889//Agencia Nacional de Investigación e Innovación/ ; }, mesh = {Humans ; Male ; Female ; *Speech Perception/physiology ; Adult ; *Electroencephalography/methods ; Young Adult ; Cerebral Cortex/physiology ; Acoustic Stimulation/methods ; }, abstract = {A social scene is particularly informative when people are distinguishable. To understand somebody amid "cocktail party" chatter, we automatically index their voice. This ability is underpinned by parallel processing of vocal spectral contours from speech sounds, but it has not yet been established how this occurs in the brain's cortex. We investigate single-trial neural tracking of slow frequency modulations in speech using electroencephalography. Participants briefly listened to unfamiliar single speakers, and in addition, they performed a cocktail party comprehension task. Quantified through stimulus reconstruction methods, robust tracking was found in neural responses to slow (delta-theta range) modulations of frequency contours in the fourth and fifth formant band, equivalent to the 3.5-5 kHz audible range. The spectral spacing between neighboring instantaneous frequency contours (ΔF), which also yields indexical information from the vocal tract, was similarly decodable. Moreover, EEG evidence of listeners' spectral tracking abilities predicted their chances of succeeding at selective listening when faced with two-speaker speech mixtures. In summary, the results indicate that the communicating brain can rely on locking of cortical rhythms to major changes led by upper resonances of the vocal tract. Their corresponding articulatory mechanics hence continuously issue a fundamental credential for listeners to target in real time.}, } @article {pmid39665279, year = {2024}, author = {Heiszenberger, E and Reinisch, E and Hartmann, F and Brown, E and Pustka, E}, title = {Perceptually Easy Second-Language Phones Are Not Always Easy: The Role of Orthography and Phonology in Schwa Realization in Second-Language French.}, journal = {Language and speech}, volume = {}, number = {}, pages = {238309241277995}, doi = {10.1177/00238309241277995}, pmid = {39665279}, issn = {1756-6053}, abstract = {Encoding and establishing a new second-language (L2) phonological category is notoriously difficult. This is particularly true for phonological contrasts that do not exist in the learners' native language (L1). Phonological categories that also exist in the L1 do not seem to pose any problems. However, foreign-language learners are not only presented with oral input. Instructed L2 learning often involves heavy reliance on written forms of the target language. The present study investigates the contribution of orthography to the quality of phonolexical encoding by examining the acoustics of French schwa produced by Austrian German learners-a perceptually and articulatorily easy L2 phone with incongruent grapheme-phoneme correspondences between the L1 and L2. We compared production patterns in an auditory word-repetition task (without orthographic input) with those in a word-reading task. We analyzed the formant values (F1, F2, F3) of the schwa realizations of two groups of Austrian high-school students who had been learning French for 1 and 6 years. The results show that production patterns are more likely to be affected by L1 grapheme-to-phoneme correspondences when orthographic input is present.
However, orthography does not appear to play the dominant role, as L2 development patterns are strongly determined by both the speaker and especially the lexical item, suggesting a highly complex interaction of multiple internal and external factors in the establishment of L2 phonological categories beyond orthography and phonology.}, } @article {pmid39643915, year = {2024}, author = {Fadeev, KA and Romero Reyes, IV and Goiaeva, DE and Obukhova, TS and Ovsiannikova, TM and Prokofyev, AO and Rytikova, AM and Novikov, AY and Kozunov, VV and Stroganova, TA and Orekhova, EV}, title = {Attenuated processing of vowels in the left temporal cortex predicts speech-in-noise perception deficit in children with autism.}, journal = {Journal of neurodevelopmental disorders}, volume = {16}, number = {1}, pages = {67}, pmid = {39643915}, issn = {1866-1955}, mesh = {Humans ; Male ; *Speech Perception/physiology ; *Magnetoencephalography ; Child ; *Temporal Lobe/physiopathology ; *Noise ; Acoustic Stimulation ; Evoked Potentials, Auditory/physiology ; Autism Spectrum Disorder/physiopathology/complications ; Adolescent ; Auditory Cortex/physiopathology ; Autistic Disorder/physiopathology/complications ; }, abstract = {BACKGROUND: Difficulties with speech-in-noise perception in autism spectrum disorders (ASD) may be associated with impaired analysis of speech sounds, such as vowels, which represent the fundamental phoneme constituents of human speech. Vowels elicit early (< 100 ms) sustained processing negativity (SPN) in the auditory cortex that reflects the detection of an acoustic pattern based on the presence of formant structure and/or periodic envelope information (f0) and its transformation into an auditory "object".

METHODS: We used magnetoencephalography (MEG) and individual brain models to investigate whether SPN is altered in children with ASD and whether this deficit is associated with impairment in their ability to perceive speech in the background of noise. MEG was recorded while boys with ASD and typically developing boys passively listened to sounds that differed in the presence/absence of f0 periodicity and formant structure. Word-in-noise perception was assessed in a separate psychoacoustic experiment using stationary and amplitude-modulated noise with varying signal-to-noise ratios.

RESULTS: SPN was present in both groups with similarly early onset. In children with ASD, the SPN associated with processing formant structure was reduced predominantly in the cortical areas lateral and medial to the primary auditory cortex, starting at ~150-200 ms after stimulus onset. In the left hemisphere, this deficit correlated with the impaired ability of children with ASD to recognize words in amplitude-modulated noise, but not in stationary noise.

CONCLUSIONS: These results suggest that perceptual grouping of vowel formants into phonemes is impaired in children with ASD and that, in the left hemisphere, this deficit contributes to their difficulties with speech perception in fluctuating background noise.}, } @article {pmid39605265, year = {2024}, author = {Xie, B and Li, Z and Wang, H and Kuang, X and Ni, W and Zhong, R and Li, Y}, title = {[The influence of vowel and sound intensity on the results of voice acoustic formant detection was analyzed].}, journal = {Lin chuang er bi yan hou tou jing wai ke za zhi = Journal of clinical otorhinolaryngology head and neck surgery}, volume = {38}, number = {12}, pages = {1149-1153}, doi = {10.13201/j.issn.2096-7993.2024.12.011}, pmid = {39605265}, issn = {2096-7993}, mesh = {Humans ; Male ; Female ; Young Adult ; *Speech Acoustics ; Voice Quality ; Phonetics ; Voice/physiology ; Adult ; }, abstract = {Objective: This study aims to explore the influence of vowels and sound intensity on formants, so as to provide a reference for the selection of sound samples and vocalization methods in acoustic detection. Methods: Thirty-eight healthy subjects (19 male, 19 female), aged 19-24 years, were recruited. The formants of different vowels (/a/, /(?)/, /i/ and /u/) and different sound intensities (lowest sound, comfort sound, highest true sound and highest falsetto sound) were analyzed, and pairwise comparisons were made between groups with significant differences. Results: ①In the first formant, the vowels /a/ and /(?)/ were larger than /i/ and /u/; in the second formant, /i/ was the largest. The minimum value of the first formant occurred at the lowest sound of /i/ and the maximum at the highest sound of /a/. ②The first formant increased with sound intensity in the chest voice range, while the second formant decreased significantly on entering the highest falsetto. Conclusion: Different vowels and sound intensities yield different formant distributions; that is, vowel and sound intensity influence the formants to different degrees. The extreme values of the first formant allow a preliminary determination of the maximum normal range, which is helpful for improving acoustic detection.}, } @article {pmid39589237, year = {2025}, author = {Fagniart, S and Delvaux, V and Harmegnies, B and Huberlant, A and Huet, K and Piccaluga, M and Watterman, I and Charlier, B}, title = {Producing Nasal Vowels Without Nasalization? Perceptual Judgments and Acoustic Measurements of Nasal/Oral Vowels Produced by Children With Cochlear Implants and Typically Hearing Peers.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {68}, number = {1}, pages = {301-322}, doi = {10.1044/2024_JSLHR-24-00083}, pmid = {39589237}, issn = {1558-9102}, mesh = {Humans ; *Cochlear Implants ; Female ; Male ; Child ; *Speech Acoustics ; *Phonetics ; *Cues ; *Speech Perception/physiology ; *Judgment ; Speech Production Measurement/methods ; Speech/physiology ; Nose/physiology ; Deafness/rehabilitation ; }, abstract = {PURPOSE: The objective of the present study is to investigate nasal and oral vowel production in French-speaking children with cochlear implants (CIs) and children with typical hearing (TH). Vowel nasality relies primarily on acoustic cues that may be less effectively transmitted by the implant. The study investigates how children with CIs manage to produce these segments in French, a language with contrastive vowel nasalization.

METHOD: The children performed a task in which they repeated sentences containing a consonant-vowel-consonant-vowel-type pseudoword, the vowel being a nasal or oral vowel from French. Thirteen children with CIs and 25 children with TH completed the task. Among the children with CIs, the level of exposure to Cued Speech (CS) was either occasional (CS-) or intense (CS+). The productions were analyzed through perceptual judgments and acoustic measurements. Different acoustic cues related to nasality were collected: segmental durations, formant values, and predicted values of nasalization. Multiple regression analyses were conducted to examine which acoustic features are associated with perceived nasality in perceptual judgments.

RESULTS: The perceptual judgments performed on the children's speech productions indicate that children with sustained exposure to CS (CS+) exhibited the best-identified and most distinct oral/nasal productions. Acoustic measures revealed different production profiles among the groups: children in the CS+ group seem to differentiate between nasal and oral vowels by relying on segmental duration cues and variations in oropharyngeal configurations (associated with formant differences), but less through nasal resonance.

CONCLUSION: The study highlights (a) a benefit of sustained CS practice for CI children for the intelligibility of nasal-oral segments, (b) privileged exploitation of temporal (segmental duration) and salient acoustic cues (oropharyngeal configuration) in the CS+ group, and (c) difficulties among children with CI in distinguishing nasal-oral segments through nasal resonance.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.27744768.}, } @article {pmid39550323, year = {2024}, author = {Bøyesen, B and Hide, Ø}, title = {Using Twang and Medialization Techniques to Gain Feminine-Sounding Speech in Trans Women.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.10.020}, pmid = {39550323}, issn = {1873-4588}, abstract = {OBJECTIVES: In this study, we introduce an intervention based on two techniques: twang and medialization. The hypothesis is that a combination of these two techniques will enable trans women to gain feminine-sounding speech without vocal strain or harm.

METHOD: Five trans women took part in the study. A control group of five cisgender women and five cisgender men was included. A list of 14 monosyllabic words was created, in which the vowel /ɑ/ was embedded in various consonant contexts. All participants were asked to read the word list three times, each time presented in a different order. The trans women read the word list before and after intervention. Acoustic analyses of fundamental frequency and the first, second, and third formant frequencies were conducted. For the perceptual analysis, 60 voice samples were selected from the entire material. Fifteen listeners were asked whether they perceived the voice samples as feminine or masculine, or whether they were uncertain. The listeners were also asked for gender judgments based on sentences read by the trans women after intervention.

RESULTS: The acoustic analyses revealed an increase in fundamental frequencies and first, second, and third formants after intervention for all five trans women, approaching the values of the female controls. The perceptual judgments showed that the majority of the trans women voice samples were perceived as feminine after intervention.

CONCLUSIONS: Based on the acoustic analyses and the perceptual evaluations, the combination of the twang and medialization techniques appears to enable the trans women to obtain feminine attribution. Nevertheless, the study is too small for generalizations. A take-home message, however, is that it is appropriate to focus primarily on resonance, in addition to speaking fundamental frequency, to gain feminine-sounding speech.}, } @article {pmid39531311, year = {2024}, author = {Ponsonnet, M and Coupé, C and Pellegrino, F and Garcia Arasco, A and Pisanski, K}, title = {Vowel signatures in emotional interjections and nonlinguistic vocalizations expressing pain, disgust, and joy across languages.}, journal = {The Journal of the Acoustical Society of America}, volume = {156}, number = {5}, pages = {3118-3139}, doi = {10.1121/10.0032454}, pmid = {39531311}, issn = {1520-8524}, mesh = {Humans ; *Emotions ; Phonetics ; Language ; Speech Acoustics ; Pain/psychology ; Voice Quality ; Happiness ; }, abstract = {In this comparative cross-linguistic study we test whether expressive interjections (words like ouch or yay) share similar vowel signatures across the world's languages, and whether these can be traced back to nonlinguistic vocalizations (like screams and cries) expressing the same emotions of pain, disgust, and joy. We analyze vowels in interjections from dictionaries of 131 languages (over 600 tokens) and compare these with nearly 500 vowels based on formant frequency measures from voice recordings of volitional nonlinguistic vocalizations. We show that across the globe, pain interjections feature a-like vowels and wide falling diphthongs ("ai" as in Ayyy! "aw" as in Ouch!), whereas disgust and joy interjections do not show robust vowel regularities that extend geographically. In nonlinguistic vocalizations, all emotions yield distinct vowel signatures: pain prompts open vowels such as [a], disgust schwa-like central vowels, and joy front vowels such as [i]. Our results show that pain is the only affective experience tested with a clear, robust vowel signature that is preserved between nonlinguistic vocalizations and interjections across languages. These results offer empirical evidence for iconicity in some expressive interjections. We consider potential mechanisms and origins, from evolutionary pressures and sound symbolism to colexification, proposing testable hypotheses for future research.}, } @article {pmid39516258, year = {2024}, author = {Carranante, G and Cany, C and Farri, P and Giavazzi, M and Varnet, L}, title = {Mapping the spectrotemporal regions influencing perception of French stop consonants in noise.}, journal = {Scientific reports}, volume = {14}, number = {1}, pages = {27183}, pmid = {39516258}, issn = {2045-2322}, support = {ANR-20-CE28-0004//Agence Nationale de la Recherche/ ; ANR-17-EURE-0017//Agence Nationale de la Recherche/ ; }, mesh = {Humans ; *Speech Perception/physiology ; Female ; Male ; *Noise ; Adult ; *Phonetics ; Young Adult ; Language ; Cues ; Speech Acoustics ; France ; Acoustic Stimulation ; }, abstract = {Understanding how speech sounds are decoded into linguistic units has been a central research challenge over the last century. This study follows a reverse-correlation approach to reveal the acoustic cues listeners use to categorize French stop consonants in noise.
Compared to previous methods, this approach ensures an unprecedented level of detail with only minimal theoretical assumptions. Thirty-two participants performed a speech-in-noise discrimination task based on natural /aCa/ utterances, with C = /b/, /d/, /g/, /p/, /t/, or /k/. The trial-by-trial analysis of their confusions enabled us to map the spectrotemporal information they relied on for their decisions. In place-of-articulation contrasts, the results confirmed the critical role of formant consonant-vowel transitions, used by all participants, and, to a lesser extent, vowel-consonant transitions and high-frequency release bursts. Similarly, for voicing contrasts, we validated the prominent role of the voicing bar cue, with some participants also using formant transitions and burst cues. This approach revealed that most listeners use a combination of several cues for each task, with significant variability within the participant group. These insights shed new light on decades-old debates regarding the relative importance of cues for phoneme perception and suggest that research on acoustic cues should not overlook individual variability in speech perception.}, } @article {pmid39515817, year = {2024}, author = {Lin, YC and Yan, HT and Lin, CH and Chang, HH}, title = {Identifying and Estimating Frailty Phenotypes by Vocal Biomarkers: Cross-Sectional Study.}, journal = {Journal of medical Internet research}, volume = {26}, number = {}, pages = {e58466}, pmid = {39515817}, issn = {1438-8871}, mesh = {Humans ; Aged ; Cross-Sectional Studies ; *Frailty/physiopathology ; Male ; Female ; *Phenotype ; *Biomarkers ; Middle Aged ; Voice/physiology ; Aged, 80 and over ; Taiwan ; Frail Elderly/statistics & numerical data ; Sarcopenia/physiopathology/diagnosis ; }, abstract = {BACKGROUND: Researchers have developed a variety of indices to assess frailty. Recent research indicates that the human voice reflects frailty status. Frailty phenotypes are seldom discussed in the literature on the aging voice.

OBJECTIVE: This study aims to examine potential phenotypes of frail older adults and determine their correlation with vocal biomarkers.

METHODS: Participants aged ≥60 years who visited the geriatric outpatient clinic of a teaching hospital in central Taiwan between 2020 and 2021 were recruited. We identified 4 frailty phenotypes: energy-based frailty, sarcopenia-based frailty, hybrid-based frailty-energy, and hybrid-based frailty-sarcopenia. Participants were asked to pronounce a sustained vowel "/a/" for approximately 1 second. The speech signals were digitized and analyzed. Four voice parameters-the average number of zero crossings (A1), variations in local peaks and valleys (A2), variations in first and second formant frequencies (A3), and spectral energy ratio (A4)-were used for analyzing changes in voice. Logistic regression was used to elucidate the prediction model.

RESULTS: Among 277 older adults, an increase in A1 values was associated with a lower likelihood of energy-based frailty (odds ratio [OR] 0.81, 95% CI 0.68-0.96), whereas an increase in A2 values resulted in a higher likelihood of sarcopenia-based frailty (OR 1.34, 95% CI 1.18-1.52). Respondents with larger A3 and A4 values had a higher likelihood of hybrid-based frailty-sarcopenia (OR 1.03, 95% CI 1.002-1.06) and hybrid-based frailty-energy (OR 1.43, 95% CI 1.02-2.01), respectively.

CONCLUSIONS: Vocal biomarkers might be potentially useful in estimating frailty phenotypes. Clinicians can use 2 crucial acoustic parameters, namely A1 and A2, to diagnose a frailty phenotype that is associated with insufficient energy or reduced muscle function. The assessment of A3 and A4 involves a complex frailty phenotype.}, } @article {pmid39487102, year = {2025}, author = {Hullebus, M and Gafos, A and Boll-Avetisyan, N and Langus, A and Fritzsche, T and Höhle, B}, title = {Infant preference for specific phonetic cue relations in the contrast between voiced and voiceless stops.}, journal = {Infancy : the official journal of the International Society on Infant Studies}, volume = {30}, number = {1}, pages = {e12630}, pmid = {39487102}, issn = {1532-7078}, support = {317633480 - SFB 1287//Deutsche Forschungsgemeinschaft/ ; }, mesh = {Humans ; *Cues ; *Speech Perception ; *Phonetics ; Male ; Female ; Infant ; Speech Acoustics ; Adult ; Acoustic Stimulation ; Language Development ; }, abstract = {Acoustic variability in the speech input has been shown, in certain contexts, to be beneficial during infants' acquisition of sound contrasts. One approach attributes this result to the potential of variability to make the stability of individual cues visible. Another approach suggests that, instead of highlighting individual cues, variability uncovers stable relations between cues that signal a sound contrast. Here, we investigate the relation between Voice Onset Time and the onset of F1 formant frequency, two cues that subserve the voicing contrast in German. First, we verified that German-speaking adults' use of VOT to categorize voiced and voiceless stops is dependent on the value of the F1 onset frequency, in the specific form of a so-called trading relation. Next, we tested whether 6-month-old German learning infants exhibit differential sensitivity to stimulus continua in which the cues varied to an equal extent, but either adhered to the trading relation established in the adult experiment or adhered to a reversed relation. Our results present evidence that infants prefer listening to speech in which phonetic cues conform to certain cue trading relations over cue relations that are reversed.}, } @article {pmid39473806, year = {2024}, author = {Ayadi, H and Elbéji, A and Despotovic, V and Fagherazzi, G}, title = {Digital Vocal Biomarker of Smoking Status Using Ecological Audio Recordings: Results from the Colive Voice Study.}, journal = {Digital biomarkers}, volume = {8}, number = {1}, pages = {159-170}, pmid = {39473806}, issn = {2504-110X}, abstract = {INTRODUCTION: The complex health, social, and economic consequences of tobacco smoking underscore the importance of incorporating reliable and scalable data collection on smoking status and habits into research across various disciplines. Given that smoking impacts voice production, we aimed to develop a gender and language-specific vocal biomarker of smoking status.

METHODS: Leveraging data from the Colive Voice study, we used statistical analysis methods to quantify the effects of smoking on voice characteristics. Various voice feature extraction methods combined with machine learning algorithms were then used to produce a gender and language-specific (English and French) digital vocal biomarker to differentiate smokers from never-smokers.

RESULTS: A total of 1,332 participants were included after propensity score matching (mean age = 43.6 [13.65]; 64.41% female; 56.68% English speakers; 50% smokers and 50% never-smokers). We observed differences in the distribution of voice features: for women, the fundamental frequency F0, the frequencies of formants F1, F2, and F3, and the harmonics-to-noise ratio were lower in smokers compared to never-smokers (p < 0.05), while for men no significant disparities were noted between the two groups. The accuracy and AUC of smoking status prediction reached 0.71 and 0.76, respectively, for the female participants, and 0.65 and 0.68, respectively, for the male participants.

CONCLUSION: We have shown that voice features are impacted by smoking. We have developed a novel digital vocal biomarker that can be used in clinical and epidemiological research to assess smoking status in a rapid, scalable, and accurate manner using ecological audio recordings.}, } @article {pmid39461704, year = {2024}, author = {Li, JJ and Daliri, A and Kim, KS and Max, L}, title = {Does pre-speech auditory modulation reflect processes related to feedback monitoring or speech movement planning?.}, journal = {Neuroscience letters}, volume = {843}, number = {}, pages = {138025}, pmid = {39461704}, issn = {1872-7972}, support = {R01 DC014510/DC/NIDCD NIH HHS/United States ; R01 DC017444/DC/NIDCD NIH HHS/United States ; R01 DC020162/DC/NIDCD NIH HHS/United States ; R01 DC020707/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Male ; Female ; *Speech/physiology ; Adult ; Young Adult ; *Electroencephalography/methods ; *Speech Perception/physiology ; Auditory Cortex/physiology ; Acoustic Stimulation/methods ; Movement/physiology ; Auditory Perception/physiology ; }, abstract = {Previous studies have revealed that auditory processing is modulated during the planning phase immediately prior to speech onset. To date, the functional relevance of this pre-speech auditory modulation (PSAM) remains unknown. Here, we investigated whether PSAM reflects neuronal processes that are associated with preparing auditory cortex for optimized feedback monitoring as reflected in online speech corrections. Combining electroencephalographic PSAM data from a previous data set with new acoustic measures of the same participants' speech, we asked whether individual speakers' extent of PSAM is correlated with the implementation of within-vowel articulatory adjustments during /b/-vowel-/d/ word productions. Online articulatory adjustments were quantified as the extent of change in inter-trial formant variability from vowel onset to vowel midpoint (a phenomenon known as centering). This approach allowed us to also consider inter-trial variability in formant production, and its possible relation to PSAM, at vowel onset and midpoint separately. Results showed that inter-trial formant variability was significantly smaller at vowel midpoint than at vowel onset. PSAM was not significantly correlated with this amount of change in variability as an index of within-vowel adjustments. Surprisingly, PSAM was negatively correlated with inter-trial formant variability not only in the middle but also at the very onset of the vowels. Thus, speakers with more PSAM produced formants that were already less variable at vowel onset. 
Findings suggest that PSAM may reflect processes that influence speech acoustics as early as vowel onset and, thus, that are directly involved in motor command preparation (feedforward control) rather than output monitoring (feedback control).}, } @article {pmid39448279, year = {2024}, author = {Pekdemir, A and Kemaloğlu, YK and Gölaç, H and İriz, A and Köktürk, O and Mengü, G}, title = {The Self-Assessment, Perturbation, and Resonance Values of Voice and Speech in Individuals with Snoring and Obstructive Sleep Apnea.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.09.018}, pmid = {39448279}, issn = {1873-4588}, abstract = {PURPOSE: The static and dynamic soft tissue changes resulting in hypopnea and/or apnea in the subjects with obstructive sleep apnea (OSA) occur in the upper airway, which also serves as the voice or speech tract. In this study, we looked for the Voice Handicap Index-10 (VHI-10) and Voice-Related Quality of Life (V-RQOL) scores in addition to perturbation and formant values of the vowels in those with snoring and OSA.

METHODS: Epworth Sleepiness Scale (ESS) and STOP-Bang scores, Body-Mass Index (BMI), neck circumference (NC), modified Mallampati Index, tonsil size, Apnea-Hypopnea Index, VHI-10 and V-RQOL scores, and the perturbation values, formant values, and fundamental frequency of the voice samples were collected for evaluation.

RESULTS: The data revealed that the VHI-10 and V-RQOL scores, but not the perturbation and formant values, were significantly different between the control and OSA subjects, and that both scores were significantly correlated with ESS and NC. Further, a few significant correlations of BMI and tonsil size with the formant and perturbation values were also found.

CONCLUSIONS: Our data reveal that (i) VHI-10 and V-RQOL were good identifiers for those with OSA, and (ii) perturbation and formant values were related particularly to tonsil size and, further, to BMI. Hence, we could say that in an attempt to use a voice parameter to screen OSA, VHI-10 and V-RQOL appeared to be better than the objective voice measures, which could be variable due to the tonsil size and BMI of the subjects.}, } @article {pmid39445770, year = {2024}, author = {Feng, S and Jiang, X}, title = {Acoustic encoding of vocally expressed confidence and doubt in Chinese bidialectics.}, journal = {The Journal of the Acoustical Society of America}, volume = {156}, number = {4}, pages = {2860-2876}, doi = {10.1121/10.0032400}, pmid = {39445770}, issn = {1520-8524}, mesh = {Adult ; Female ; Humans ; Male ; Intention ; *Language ; Multilingualism ; Phonetics ; *Speech Acoustics ; *Speech Perception ; }, abstract = {Language communicators use acoustic-phonetic cues to convey a variety of social information in the spoken language, and the learning of a second language affects speech production in a social setting. It remains unclear how speaking different dialects could affect the acoustic metrics underlying the intended communicative meanings. Nine Chinese Bayannur-Mandarin bidialectics produced single-digit numbers in statements of both Standard Mandarin and the Bayannur dialect with different levels of intended confidence. Fifteen listeners judged the intention presence and confidence level. Prosodically unmarked and marked stimuli exhibited significant differences in perceived intention. A higher intended level was perceived as more confident. The acoustic analysis revealed that the segmental (third and fourth formants, center of gravity), suprasegmental (mean fundamental frequency, fundamental frequency range, duration), and source features (harmonics-to-noise ratio, cepstral peak prominence) can distinguish between confident and doubtful expressions. Most features also distinguished between dialect and Mandarin productions. Interactions on the fourth formant and mean fundamental frequency suggested that speakers made greater use of acoustic parameters to encode confidence and doubt in the Bayannur dialect than in Mandarin. In machine learning experiments, the above-chance-level overall classification rates for confidence and doubt and the in-group advantage supported the dialect theory.}, } @article {pmid39443329, year = {2024}, author = {Persson, A}, title = {The acoustic characteristics of Swedish vowels.}, journal = {Phonetica}, volume = {81}, number = {6}, pages = {599-643}, pmid = {39443329}, issn = {1423-0321}, mesh = {Humans ; *Speech Acoustics ; *Phonetics ; Sweden ; *Language ; Speech Perception ; Sound Spectrography ; Female ; Male ; Cues ; Adult ; }, abstract = {The Swedish vowel space is relatively densely populated with 21 categories that differ in quality and quantity. Existing descriptions of the entire space rest on recordings made in the late 1990s or earlier, while recent work in general has focused on subsets of the space. The present paper reports on static and dynamic acoustic analyses of the entire vowel space using a recently released database of h-VOWEL-d words (SwehVd). The results highlight the importance of static and dynamic spectral and temporal cues for Swedish vowel category distinction.
The first two formants and vowel duration are the primary acoustic cues to vowel identity; however, the third formant contributes to increased category separability for neighboring contrasts presumed to differ in lip-rounding. In addition, even though all long-short vowel pairs differ systematically in duration, they also display considerable spectral differences, suggesting that quantity distinctions are not separate from quality distinctions in Swedish. The dynamic analysis further suggests formant movements in both long and short vowels, with [e:] and [o:] displaying clearer patterns of diphthongization.}, } @article {pmid39438167, year = {2024}, author = {Martínez-Olalla, R and Hidalgo-De la Guía, I and Gayarzábal-Heinze, E and Fernández-Ruiz, R and Núñez-Vidal, E and Álvarez-Marquina, A and Palacios-Alonso, D}, title = {Analysis of Voice Quality in Children With Smith-Magenis Syndrome.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.09.026}, pmid = {39438167}, issn = {1873-4588}, abstract = {UNLABELLED: The production of phonation involves very complex processes, linked to the physical, clinical, and emotional state of the speaker. Thus, in populations with neurological diseases, it is possible to find in the voice signal the imprint left by the deterioration of certain cortical areas or of parts of the neurocognitive mechanisms involved in speech. In previous works, the authors determined the relationship between the pathological voice characteristics of speakers with Smith-Magenis syndrome (SMS) and lower values of the cepstral peak prominence (CPP) with respect to normative speakers. They also described the presence of subharmonics in their voices.

OBJECTIVES: The present study aims to verify whether both characteristics can be used simultaneously to differentiate SMS voices from neurotypical voices. It also analyzes whether the formant trajectories vary in synchrony with the subharmonics.

METHODS: To do this, the effect of subharmonics in the voices of 12 SMS individuals was isolated to determine whether they were responsible for the lower CPP values. In regions where subharmonics were present, the CPP was also evaluated from the peak that reflected the value of f0, rather than from the most prominent peak. This provided a baseline for the CPP value in the presence of subharmonics. It was then checked whether changes in the formants occurred synchronously with the appearance of those subharmonics; if so, the muscles that control the position of the jaw and tongue would be affected at the same time as the larynx. The latter was difficult to observe since the samples were very short. Phonatory performance on a sustained /a/ was compared between a normotypical group and a non-normotypical group of children, balanced and matched in age and gender. The Spanish Association of Smith-Magenis Syndrome (ASME) provided access to almost 20% of the SMS population in Spain.
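For readers implementing this kind of analysis, the CPP measure evaluated above can be sketched in a few lines of Python. This is a minimal illustration of one common CPP variant (the cepstral peak above a regression baseline), assuming a mono frame x at sampling rate sr and a 60-330 Hz f0 search range; it is not the authors' implementation.

import numpy as np

def cpp(x, sr, fmin=60.0, fmax=330.0):
    """Cepstral peak prominence of one voiced analysis frame (dB-scale units)."""
    x = x * np.hanning(len(x))                              # taper the frame
    log_mag = 20 * np.log10(np.abs(np.fft.fft(x)) + 1e-12)  # log-magnitude spectrum (dB)
    cep = np.abs(np.fft.ifft(log_mag))                      # real cepstrum
    lo, hi = int(sr / fmax), int(sr / fmin)                 # quefrency bins for the f0 range
    peak = lo + int(np.argmax(cep[lo:hi]))                  # cepstral peak in that range
    q = np.arange(len(cep)) / sr                            # quefrency axis (seconds)
    slope, intercept = np.polyfit(q[lo:hi], cep[lo:hi], 1)  # regression baseline
    return cep[peak] - (slope * q[peak] + intercept)

# e.g., cpp(frame, 44100) on a 40-50 ms voiced frame; evaluating the peak at the
# quefrency of f0 itself (rather than the global peak) gives the subharmonic-robust
# variant described in the METHODS above.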

RESULTS: The CPP differentiates between normative speakers and those with SMS, even when the effect of subharmonics is isolated.

CONCLUSIONS: The CPP is a robust index for determining the degree of dysphonia. It makes it possible to differentiate pathological voices from healthy voices even when subharmonics are present. Subharmonics are characteristic of the voices of SMS individuals and are not present in healthy voices. Both indices can be used simultaneously to differentiate SMS voices from neurotypical voices.}, } @article {pmid39418590, year = {2024}, author = {Krakauer, J and Naber, C and Niziolek, CA and Parrell, B}, title = {Divided Attention Has Limited Effects on Speech Sensorimotor Control.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {11}, pages = {4358-4368}, pmid = {39418590}, issn = {1558-9102}, support = {R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC019134/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Attention/physiology ; Male ; Female ; Young Adult ; *Speech/physiology ; Adult ; Feedback, Sensory/physiology ; Adaptation, Physiological/physiology ; Speech Perception/physiology ; Visual Perception/physiology ; Adolescent ; }, abstract = {PURPOSE: When vowel formants are externally perturbed, speakers change their production to oppose that perturbation both during the ongoing production (compensation) and in future productions (adaptation). To date, attempts to explain the large variability across individuals in these responses have focused on trait-based characteristics such as auditory acuity, but evidence from other motor domains suggests that attention may modulate the motor response to sensory perturbations. Here, we test the extent to which divided attention impacts sensorimotor control for supralaryngeal articulation.

METHOD: Neurobiologically healthy speakers were exposed to random (Experiment 1) or consistent (Experiment 2) real-time auditory perturbation of vowel formants to measure online compensation and trial-to-trial adaptation, respectively. In both experiments, participants completed two conditions: one with a simultaneous visual distractor task to divide attention and one without this secondary task.

RESULTS: Divided visual attention slightly reduced online compensation, but only starting > 300 ms after vowel onset, well beyond the typical duration of vowels in speech. Divided attention had no effect on adaptation.
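For illustration, the early-window compensation measure implied by these results can be sketched with the praat-parselmouth package as below; the file names, the vowel onset time, and the 0-300 ms window are hypothetical placeholders, not the study's actual pipeline.

import numpy as np
import parselmouth

def mean_f1(wav_path, onset_s, window_s=0.300, step=0.005):
    """Mean F1 (Hz) over an early window after vowel onset."""
    snd = parselmouth.Sound(wav_path)
    formants = snd.to_formant_burg(time_step=step)           # Burg formant tracking
    times = np.arange(onset_s, onset_s + window_s, step)
    return float(np.nanmean([formants.get_value_at_time(1, t) for t in times]))

# Online compensation can then be expressed as the F1 difference between a
# perturbed trial and a baseline trial, measured before 300 ms after onset:
# delta_f1 = mean_f1("perturbed_trial.wav", 0.10) - mean_f1("baseline_trial.wav", 0.12)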

CONCLUSIONS: The results from both experiments suggest that the use of sensory feedback in typical speech motor control is a largely automatic process unaffected by divided visual attention, indicating that the source of cross-speaker variability in response to formant perturbations likely lies within the speech production system rather than in higher-level cognitive processes. Methodologically, these results suggest that compensation for formant perturbations should be measured prior to 300 ms after vowel onset to avoid any potential impact of attention or other higher-order cognitive factors.}, } @article {pmid39414424, year = {2024}, author = {He, Y and Wang, X and Huang, T and Zhao, W and Fu, Z and Zheng, Q and Jin, L and Kim, H and Liu, H}, title = {The Study of Speech Acoustic Characteristics of Elderly Individuals with Presbyphagia in Ningbo, China.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.09.041}, pmid = {39414424}, issn = {1873-4588}, abstract = {The feasibility of using acoustic parameters to predict presbyphagia has been preliminarily confirmed. Considering that age and gender can influence the results of acoustic parameters, this study aimed to further explore the specific effects of age and gender on acoustic parameter analysis of the elderly population over 60 years old with presbyphagia. A total of 45 participants were enrolled and divided into three groups (60-69 years old, 70-79 years old, and 80-89 years old). Acoustic parameters, including maximum phonation time, first to third formant frequencies (F1-F3) of /a/, /i/, and /u/, oral diadochokinesis, the acoustic vowel space, and laryngeal diadochokinesis (LDDK), were extracted and calculated. Two-way analysis of variance was used to analyze the effects of age and gender on the acoustic parameters. The results indicate that the /hʌ/ LDDK rate differed significantly across age groups, with the 80-89 age group being significantly slower than the 60-69 age group. F1/a/, F2/a/, F2/i/, F3/i/, and F2i/F2u differed systematically between genders, with values being lower in males than in females. /hʌ/ LDDK regularity showed a consistent gender difference, with greater regularity in females. No significant differences were observed for other acoustic parameters. No significant interactions were revealed. According to the preliminary data, we hypothesized that respiratory capacity and control during vocal fold abduction weaken with aging. This highlights the importance of continuously monitoring the respiratory impact on swallowing function in elderly individuals. Additionally, gender influenced several acoustic parameters, indicating the necessity to differentiate between genders when assessing presbyphagia using acoustic parameters, especially focusing on swallowing function in elderly males in Ningbo.}, } @article {pmid39414423, year = {2024}, author = {Wang, Y and Zhao, Y}, title = {Acoustic Characteristics of Modern Chinese Folk Singing at Different Vocal Efforts.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.09.022}, pmid = {39414423}, issn = {1873-4588}, abstract = {OBJECTIVES: Modern Chinese folk singing was developed by fusing regionally specific traditional Chinese singing with Western scientific training techniques.
The purpose of this research is to contribute to the exploration of the acoustic characteristics of Chinese folk singing and of the efficient resonance space for its performance.

METHOD: Seven tenors and seven sopranos were invited to sing three songs and read the lyrics in an anechoic chamber. The vocal outputs were meticulously recorded and subjected to a comprehensive acoustic analysis. Overall equivalent sound level, long-term average spectrum (LTAS), gain factors, and other acoustic parameters were analyzed for different vocal efforts (soft, normal, and loud), genders, and vocal modes (singing and speaking).

RESULTS: Male singers show a singer's formant at 3 kHz in the LTAS, a characteristic not found in other countries' singers or in Chinese opera singers, and slightly higher in frequency than that of Western Classical singers. Female singers do not show a singer's formant, and their LTAS curves are much flatter. The α ratio, spectral balance, and singing power ratio all increased with increasing vocal effort and were higher for singing than for speaking. Finally, there was a significant gain factor at 3 kHz, with a maximum value of 1.85 for men and 1.68 for women.
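Two of the LTAS-derived measures named here can be approximated as in the sketch below (one common definition of the α ratio and of the singing power ratio); the band edges and analysis settings are illustrative assumptions, not the study's exact parameters.

import numpy as np
from scipy.signal import welch

def ltas_measures(y, sr):
    """Alpha ratio and singing power ratio (both in dB) from a long-term spectrum."""
    freqs, psd = welch(y, fs=sr, nperseg=4096)               # long-term average spectrum
    low = (freqs >= 50) & (freqs < 1000)
    high = (freqs >= 1000) & (freqs < 5000)
    alpha = 10 * np.log10(psd[high].sum() / psd[low].sum())  # energy above vs. below 1 kHz
    psd_db = 10 * np.log10(psd + 1e-20)
    spr = psd_db[(freqs >= 0) & (freqs < 2000)].max() \
        - psd_db[(freqs >= 2000) & (freqs < 4000)].max()     # low-band peak minus 2-4 kHz peak
    return alpha, spr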

CONCLUSIONS: Male singers in Chinese folk singing have a singer's formant, a phenomenon not consistently observed in their female counterparts. The intricate acoustic characteristics of this singing style have been extensively examined and can contribute to the existing literature on the spectral properties of diverse vocal genres. Furthermore, this analysis offers foundational data essential for the optimization of room acoustics tailored to vocal performance.}, } @article {pmid39400271, year = {2024}, author = {Clopper, CG}, title = {Dynamic acoustic vowel distances within and across dialects.}, journal = {The Journal of the Acoustical Society of America}, volume = {156}, number = {4}, pages = {2497-2507}, doi = {10.1121/10.0032385}, pmid = {39400271}, issn = {1520-8524}, mesh = {Humans ; *Speech Acoustics ; *Phonetics ; *Speech Production Measurement/methods ; Voice Quality ; Acoustics ; Female ; Male ; Time Factors ; Language ; Sound Spectrography ; Adult ; }, abstract = {Vowels vary in their acoustic similarity across regional dialects of American English, such that some vowels are more similar to one another in some dialects than others. Acoustic vowel distance measures typically evaluate vowel similarity at a discrete time point, resulting in distance estimates that may not fully capture vowel similarity in formant trajectory dynamics. In the current study, language and accent distance measures, which evaluate acoustic distances between talkers over time, were applied to the evaluation of vowel category similarity within talkers. These vowel category distances were then compared across dialects, and their utility in capturing predicted patterns of regional dialect variation in American English was examined. Dynamic time warping of mel-frequency cepstral coefficients was used to assess acoustic distance across the frequency spectrum and captured predicted Southern American English vowel similarity. Root-mean-square distance and generalized additive mixed models were used to assess acoustic distance for selected formant trajectories and captured predicted Southern, New England, and Northern American English vowel similarity. Generalized additive mixed models captured the most predicted variation, but, unlike the other measures, do not return a single acoustic distance value. All three measures are potentially useful for understanding variation in vowel category similarity across dialects.}, } @article {pmid39396508, year = {2024}, author = {Ozkan Atak, HB and Aslan, F and Sennaroglu, G and Sennaroglu, L}, title = {Children with Auditory Brainstem Implants: Language Proficiency and Reading Comprehension Process.}, journal = {Audiology & neuro-otology}, volume = {}, number = {}, pages = {1-12}, doi = {10.1159/000541716}, pmid = {39396508}, issn = {1421-9700}, abstract = {INTRODUCTION: Auditory performance and language proficiency in young children who utilize auditory brainstem implants (ABIs) throughout the first 3 years of life are difficult to predict. ABI users have challenges as a result of delays in language proficiency and the acquisition of reading comprehension, even though ABI technology offers auditory experiences that enhance spoken language development. The aim of this study was to evaluate the impact of language proficiency on reading comprehension skills in children with ABI.

METHOD: In this study, 20 children with ABI were evaluated for their reading comprehension abilities and language proficiency using an Informal Reading Inventory, Test of Early Language Development-Third Edition (TELD-3), Categories of Auditory Performance-II (CAP-II), and Speech Intelligibility Rating (SIR). Three distinct aspects of reading comprehension were assessed and analyzed to provide a composite score for reading comprehension abilities. TELD-3, which measures receptive and expressive language proficiency, was presented through spoken language.

RESULTS: Studies have shown that there is a relationship between language proficiency and reading comprehension in children with ABI. In the present study, it was determined that the total reading comprehension scores of the children who had poor language proficiency and were enrolled in the school for the deaf were also low. These children used short, basic sentences, often repeated words and phrases, and had a restricted vocabulary. In addition, they had difficulty reading characters and detailed paragraphs and could not remember events in a logical order.

CONCLUSION: Children with ABI may have compromised reading comprehension abilities due to a lack of access to all the speech formants needed to develop spoken language. In addition, variables affecting the reading levels of children with ABI include factors such as age at implantation, duration of implant use, presence of additional disability, communication model, and access to auditory rehabilitation. The reading comprehension skills of ABI users were evaluated in this study for the first time in the literature and may constitute a starting point for the examination of variables affecting reading comprehension in this area.}, } @article {pmid39392353, year = {2024}, author = {Yegnanarayana, B and Pannala, V}, title = {Processing group delay spectrograms for study of formant and harmonic contours in speech signals.}, journal = {The Journal of the Acoustical Society of America}, volume = {156}, number = {4}, pages = {2422-2433}, doi = {10.1121/10.0032364}, pmid = {39392353}, issn = {1520-8524}, mesh = {Humans ; *Speech Acoustics ; Sound Spectrography ; Signal Processing, Computer-Assisted ; Speech Production Measurement/methods ; Voice Quality ; Time Factors ; Phonetics ; }, abstract = {This paper deals with the study of formant and harmonic contours by processing the group delay (GD) spectrograms of speech signals. The GD spectrum is the negative derivative of the phase spectrum with respect to frequency. A recent study shows that the GD spectrogram can be obtained without phase wrapping. Formant frequency contours can be observed in the display of the peaks of the instantaneous wideband equivalent GD spectrogram, derived using the modified single frequency filtering (SFF) analysis of speech signals. Harmonic frequency contours can be observed in the display of the peaks of the instantaneous narrowband equivalent GD spectrogram, derived using the modified SFF analysis of speech signals. For synthetic speech signals, the observed formant contours match the ground truth formant contours from which the signal is derived. For natural speech signals, the observed formant contours match approximately with the given ground truth formant contours, mostly in the voiced regions. The results are illustrated for several randomly selected utterances from the TIMIT database. While this study helps to observe the contours of formants in the display, automatic extraction of the formant frequencies needs further processing, requiring logic for eliminating the spurious points without forcing the number of formants.}, } @article {pmid39356074, year = {2024}, author = {Parrell, B and Niziolek, CA and Chen, T}, title = {Sensorimotor adaptation to a nonuniform formant perturbation generalizes to untrained vowels.}, journal = {Journal of neurophysiology}, volume = {132}, number = {5}, pages = {1437-1444}, pmid = {39356074}, issn = {1522-1598}, support = {P50 HD105353/HD/NICHD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC019134/DC/NIDCD NIH HHS/United States ; BCS 2120506//National Science Foundation (NSF)/ ; }, mesh = {Humans ; Male ; Female ; Adult ; *Adaptation, Physiological/physiology ; Young Adult ; *Speech/physiology ; Learning/physiology ; Speech Perception/physiology ; Generalization, Psychological/physiology ; Phonetics ; Feedback, Sensory/physiology ; }, abstract = {When speakers learn to change the way they produce a speech sound, how much does that learning generalize to other speech sounds?
Past studies of speech sensorimotor learning have typically tested the generalization of a single transformation learned in a single context. Here, we investigate the ability of the speech motor system to generalize learning when multiple opposing sensorimotor transformations are learned in separate regions of the vowel space. We find that speakers adapt to a nonuniform "centralization" perturbation, learning to produce vowels with greater acoustic contrast, and that this adaptation generalizes to untrained vowels, which pattern like neighboring trained vowels and show increased contrast of a similar magnitude.NEW & NOTEWORTHY We show that sensorimotor adaptation of vowels at the edges of the articulatory working space generalizes to intermediate vowels through local transfer of learning from adjacent vowels. These results extend findings on the locality of sensorimotor learning from upper limb control to speech, a complex task with an opaque and nonlinear transformation between motor actions and sensory consequences. Our results also suggest that our paradigm has potential to drive behaviorally relevant changes that improve communication effectiveness.}, } @article {pmid39322510, year = {2024}, author = {Huang, T and Wang, X and Xu, T and Zhao, W and Cao, Y and Kim, H and Yi, B}, title = {Acoustic Analysis of Mandarin-Speaking Transgender Women.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.08.037}, pmid = {39322510}, issn = {1873-4588}, abstract = {OBJECTIVES: This study aims to investigate the speech characteristics and assess the potential risk of voice fatigue and voice disorders in Chinese transgender women (TW).

METHODS: A case-control study was conducted involving TW recruited in Shanghai, China. The participants included 15 TW, 20 cisgender men (CISM), and 20 cisgender women (CISW). Acoustic parameters, including formants (F1, F2, F3, F4), cepstral peak prominence (CPP), jitter, shimmer, harmonic-to-noise ratio (HNR), noise-to-harmonics ratio (NHR), fundamental frequency (f0), and intensity, were measured across vowels, passages, and free talking. Additionally, the Voice Handicap Index-10 (VHI-10) and the Voice Fatigue Index were administered to evaluate voice-related concerns.
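Perturbation measures of this kind are conventionally obtained by scripting Praat; a minimal praat-parselmouth sketch follows. The parameter values are common Praat defaults and the file path is a hypothetical placeholder, not necessarily the settings used in this study.

import parselmouth
from parselmouth.praat import call

snd = parselmouth.Sound("speaker_vowel_a.wav")   # hypothetical recording
pitch_floor, pitch_ceiling = 75, 500
point_process = call(snd, "To PointProcess (periodic, cc)", pitch_floor, pitch_ceiling)
jitter = call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
shimmer = call([snd, point_process], "Get shimmer (local)",
               0, 0, 0.0001, 0.02, 1.3, 1.6)     # needs Sound + PointProcess selected
harmonicity = call(snd, "To Harmonicity (cc)", 0.01, pitch_floor, 0.1, 1.0)
hnr = call(harmonicity, "Get mean", 0, 0)
print(f"jitter={jitter:.4f}  shimmer={shimmer:.4f}  HNR={hnr:.1f} dB")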

RESULTS: (1) The F1 of TW was significantly higher than that of CISW for the vowels /i/ and /u/, and significantly higher than that of CISM for the vowels /a/, /i/, and /u/. The F2 of TW was significantly lower than that of CISW for the vowel /i/, significantly higher than that of CISW for the vowel /u/, and significantly higher than that of CISM for the vowels /a/ and /u/. F3 was significantly lower in TW than in CISW for the vowels /a/ and /i/. The F4 formant was significantly lower in TW than in CISW for the vowels /a/ and /i/, but significantly higher than in CISM for the vowel /u/. (2) The f0 of TW was significantly lower than that of CISW for the vowels /a/, /i/, and /u/, during passage reading, and in free speech, but was significantly higher than that of CISM during passage reading and free talking. Additionally, TW exhibited significantly higher intensity compared with CISW for the vowel /a/ and during passage reading. (3) Jitter in TW was significantly higher than in CISW for the vowels /i/ and /u/, and significantly lower than in CISM during passage reading and free talking. Shimmer was significantly higher in TW compared with both CISW and CISM for the vowels /a/ and /i/, during passage reading, and in free talking. The HNR in TW was significantly lower than in both CISW and CISM across all vowels, during passage reading, and in free talking. The NHR was significantly higher in TW than in CISW across all vowels, during passage reading, and in free talking, and significantly higher than in CISM for the vowels /a/ and /i/, during passage reading, and in free talking. The CPP in TW was significantly lower than in CISW during passage reading and free talking, and significantly lower than in CISM across all vowels, during passage reading, and in free speech. (4) The VHI-10 scores were significantly higher in TW compared with both CISM and CISW.

CONCLUSIONS: Without undergoing phonosurgery or voice training, TW exhibit certain acoustic parameters, such as f0 and some of the formants, that fall between those of CISW and CISM. The findings suggest a potential risk for voice fatigue and the development of voice disorders as TW try to modify their vocal characteristics to align with their gender identity.}, } @article {pmid39287502, year = {2024}, author = {Kim, H and Ratkute, V and Epp, B}, title = {Monaural and binaural masking release with speech-like stimuli.}, journal = {JASA express letters}, volume = {4}, number = {9}, pages = {}, doi = {10.1121/10.0028736}, pmid = {39287502}, issn = {2691-1191}, mesh = {Humans ; *Perceptual Masking/physiology ; *Speech Perception/physiology ; Adult ; Acoustic Stimulation ; Male ; Female ; Young Adult ; }, abstract = {The relevance of comodulation and interaural phase difference for speech perception is still unclear. We used speech-like stimuli to link spectro-temporal properties of formants with masking release. The stimuli comprised a tone and three masker bands centered at formant frequencies F1, F2, and F3 derived from a consonant-vowel. The target was a diotic or dichotic frequency-modulated tone following F2 trajectories. Results showed a small comodulation masking release, while the binaural masking level difference was comparable to previous findings. The data suggest that factors other than comodulation may play a dominant role in grouping frequency components in speech.}, } @article {pmid39279469, year = {2024}, author = {Chen, S and Whalen, DH and Mok, PPK}, title = {What R Mandarin Chinese /ɹ/s? - acoustic and articulatory features of Mandarin Chinese rhotics.}, journal = {Phonetica}, volume = {81}, number = {5}, pages = {509-552}, pmid = {39279469}, issn = {1423-0321}, support = {R01 DC002717/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Phonetics ; *Speech Acoustics ; *Tongue/physiology ; Female ; Male ; China ; *Language ; Adult ; Young Adult ; Speech Production Measurement ; Ultrasonography ; East Asian People ; }, abstract = {Rhotic sounds are well known for their considerable phonetic variation within and across languages and their complexity in speech production. Although rhotics in many languages have been examined and documented, the phonetic features of Mandarin rhotics remain unclear, and debates about the prevocalic rhotic (the syllable-onset rhotic) persist. This paper extends the investigation of rhotic sounds by examining the articulatory and acoustic features of Mandarin Chinese rhotics in prevocalic, syllabic (the rhotacized vowel [ɚ]), and postvocalic (r-suffix) positions. Eighteen speakers from Northern China were recorded using ultrasound imaging. Results showed that Mandarin syllabic and postvocalic rhotics can be articulated with various tongue shapes, including tongue-tip-up retroflex and tongue-tip-down bunched shapes. Different tongue shapes have no significant acoustic differences in the first three formants, demonstrating a many-to-one articulation-acoustics relationship. The prevocalic rhotics in our data were found to be articulated only with bunched tongue shapes, and were sometimes produced with frication noise at the start. In general, rhotics in all syllable positions are characterized by a close F2 and F3, though the prevocalic rhotic has a higher F2 and F3 than the syllabic and postvocalic rhotics.
The effects of syllable position and vowel context are also discussed.}, } @article {pmid39259883, year = {2024}, author = {Thompson, A and Kim, Y}, title = {Acoustic and Kinematic Predictors of Intelligibility and Articulatory Precision in Parkinson's Disease.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {10}, pages = {3595-3611}, pmid = {39259883}, issn = {1558-9102}, support = {F31 DC020121/DC/NIDCD NIH HHS/United States ; R03 DC012405/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Parkinson Disease/physiopathology/complications ; *Speech Intelligibility/physiology ; Female ; Male ; Biomechanical Phenomena ; Aged ; *Dysarthria/etiology/physiopathology ; *Speech Acoustics ; Middle Aged ; Speech Production Measurement/methods ; Case-Control Studies ; Phonetics ; }, abstract = {PURPOSE: This study investigated relationships within and between perceptual, acoustic, and kinematic measures in speakers with and without dysarthria due to Parkinson's disease (PD) across different clarity conditions. Additionally, the study assessed the predictive capabilities of selected acoustic and kinematic measures for intelligibility and articulatory precision ratings.

METHOD: Forty participants, comprising 22 with PD and 18 controls, read three phrases aloud using conversational, less clear, and more clear speaking conditions. Acoustic measures and their theoretical kinematic parallel measures (i.e., acoustic and kinematic distance and vowel space area [VSA]; second formant frequency [F2] slope and kinematic speed) were obtained from the diphthong /aɪ/ and selected vowels in the sentences. A total of 368 listeners from crowdsourcing provided ratings for intelligibility and articulatory precision. The research questions were examined using correlations and linear mixed-effects models.
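As context for the vowel space area (VSA) measure used here: acoustic VSA is typically computed as the area of the polygon spanned by corner-vowel means in the F1-F2 plane. The sketch below uses the shoelace formula; the corner-vowel values are invented for illustration.

import numpy as np

def vsa(points_hz):
    """Area (Hz^2) of the polygon through (F1, F2) corner-vowel means, in order."""
    f1, f2 = np.array(points_hz).T
    return 0.5 * abs(np.dot(f1, np.roll(f2, -1)) - np.dot(f2, np.roll(f1, -1)))

corners = [(750, 1300), (650, 1900), (300, 2200), (350, 900)]  # /a ae i u/, invented values
print(f"VSA = {vsa(corners):.0f} Hz^2")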

RESULTS: Intelligibility and articulatory precision ratings were highly correlated across all speakers. Acoustic and kinematic distance, as well as F2 slope and kinematic speed, showed moderately positive correlations. In contrast, acoustic and kinematic VSA exhibited no correlation. Among all measures, acoustic VSA and kinematic distance were robust predictors of both intelligibility and articulatory precision ratings, but they were stronger predictors of articulatory precision.

CONCLUSIONS: The findings highlight the importance of measurement selection when examining cross-domain relationships. Additionally, they support the use of behavioral modifications aimed at eliciting larger articulatory gestures to improve intelligibility in individuals with dysarthria due to PD.

OPEN SCIENCE FORM: https://doi.org/10.23641/asha.27011281.}, } @article {pmid39234407, year = {2024}, author = {Subrahmanya, A and Ranasinghe, KG and Kothare, H and Raharjo, I and Kim, KS and Houde, JF and Nagarajan, SS}, title = {Pitch corrections occur in natural speech and are abnormal in patients with Alzheimer's disease.}, journal = {Frontiers in human neuroscience}, volume = {18}, number = {}, pages = {1424920}, pmid = {39234407}, issn = {1662-5161}, abstract = {Past studies have explored formant centering, a corrective behavior of convergence over the duration of an utterance toward the formants of a putative target vowel. In this study, we establish the existence of a similar centering phenomenon for pitch in healthy elderly controls and examine how such corrective behavior is altered in Alzheimer's Disease (AD). We found that the pitch centering response in the healthy elderly was similar when correcting pitch errors below and above the target (median) pitch. In contrast, patients with AD showed an asymmetry, with a larger correction for pitch errors below the target phonation than above the target phonation. These findings indicate that pitch centering is a robust compensation behavior in human speech. Our findings also shed light on how the neurodegenerative processes that affect speech in AD may impact pitch centering.}, } @article {pmid39218756, year = {2024}, author = {Vampola, T and Horáček, J and Laukkanen, AM}, title = {Three-Dimensional Finite Element Modeling of the Singer's Formant Cluster Optimization by Epilaryngeal Narrowing With and Without Velopharyngeal Opening.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.07.035}, pmid = {39218756}, issn = {1873-4588}, abstract = {This study aimed to find the optimal geometrical configuration of the vocal tract (VT) to increase the total acoustic energy output of human voice in the frequency interval 2-3.5 kHz (the "singer's formant cluster," SFC) for the vowels [a:] and [i:], considering epilaryngeal changes and the velopharyngeal opening (VPO). The study applied 3D volume models of the vocal and nasal tract based on computed tomography images of a female speaker. The epilaryngeal narrowing (EN) increased the total sound pressure level (SPL) and SPL of the SFC by diminishing the frequency difference between acoustic resonances F3 and F4 for [a:] and between F2 and F3 for [i:]. The effect reached its maximum at the low pharynx/epilarynx cross-sectional area ratio 11.4:1 for [a:] and 25:1 for [i:]. The acoustic results obtained with the model optimization are in good agreement with the results of an internationally recognized operatic alto singer. With the EN and the VPO, the VT input reactance was positive over the entire fo singing range (ca 75-1500 Hz). The VPO increased the strength of the SFC and diminished the SPL of F1 for both vowels, but with EN, the SPL decrease was compensated. The effect of EN is not linear and depends on the vowel.
Both the EN and the VPO, alone and together, can support (singing) voice production.}, } @article {pmid39217086, year = {2024}, author = {Figueroa, C and Guillén, V and Huenupán, F and Vallejos, C and Henríquez, E and Urrutia, F and Sanhueza, F and Alarcón, E}, title = {Comparison of Acoustic Parameters of Voice and Speech According to Vowel Type and Suicidal Risk in Adolescents.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.08.006}, pmid = {39217086}, issn = {1873-4588}, abstract = {UNLABELLED: Globally, suicide prevention and understanding suicidal behavior represent significant health challenges. The predictive potential of voice, speech, and language appears to be a promising solution to the difficulty of assessment.

OBJECTIVE: To analyze variations in acoustic parameters in voice and speech based on vowel types according to different levels of suicidal risk among adolescents in a text reading task.

METHODOLOGY: Cross-sectional analytical design using nonprobabilistic sampling. Our sample comprised 98 adolescents aged 14 to 19, who underwent voice acoustic assessment along with determination of suicidal ideation through the Okasha Suicidality Scale and the Beck Depression Inventory. Acoustic analysis of the recordings was conducted using Praat for phonetic research and a Python program, with a Focusrite interface and microphone used to register voice and speech acoustic parameters such as fundamental frequency, jitter, and formants. Subsequently, data from adolescents with and without suicidal risk were compared.
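A pipeline of the kind described (Praat-based extraction plus a Python group comparison) might look like the following sketch; the file lists are hypothetical placeholders, and the nonparametric test is one reasonable choice, not necessarily the authors' statistical procedure.

import numpy as np
import parselmouth
from scipy.stats import mannwhitneyu

def mean_f0(wav_path):
    """Mean F0 (Hz) over voiced frames of a recording."""
    pitch = parselmouth.Sound(wav_path).to_pitch(pitch_floor=75, pitch_ceiling=500)
    f0 = pitch.selected_array['frequency']
    return float(np.nanmean(np.where(f0 > 0, f0, np.nan)))  # 0 Hz marks unvoiced frames

risk = [mean_f0(p) for p in ["risk_01.wav", "risk_02.wav"]]      # hypothetical files
no_risk = [mean_f0(p) for p in ["ctrl_01.wav", "ctrl_02.wav"]]   # hypothetical files
stat, p = mannwhitneyu(risk, no_risk)
print(f"U={stat:.1f}, p={p:.3f}")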

RESULTS: Significant differences were observed between suicidal and nonsuicidal adolescents in several acoustic aspects, especially in females, in fundamental frequency (F0), harmonics-to-noise ratio (HNRdB), and temporal variability measured by jitter and standard deviation. In men, differences were found in F0 and HNRdB (P < 0.05).

CONCLUSION: This study demonstrated statistically significant variations in various voice acoustic parameters among adolescents with and without suicidal risk. These findings underscore the potential relevance of voice and speech as markers for suicidal risk.}, } @article {pmid39212078, year = {2024}, author = {Zaltz, Y}, title = {The Impact of Trained Conditions on the Generalization of Learning Gains Following Voice Discrimination Training.}, journal = {Trends in hearing}, volume = {28}, number = {}, pages = {23312165241275895}, pmid = {39212078}, issn = {2331-2165}, mesh = {Humans ; Male ; Female ; Young Adult ; *Speech Perception/physiology ; *Generalization, Psychological ; *Cues ; *Noise/adverse effects ; *Acoustic Stimulation ; Adult ; Recognition, Psychology ; Perceptual Masking ; Adolescent ; Speech Acoustics ; Voice Quality ; Discrimination Learning/physiology ; Voice/physiology ; }, abstract = {Auditory training can lead to notable enhancements in specific tasks, but whether these improvements generalize to untrained tasks like speech-in-noise (SIN) recognition remains uncertain. This study examined how training conditions affect generalization. Fifty-five young adults were divided into "Trained-in-Quiet" (n = 15), "Trained-in-Noise" (n = 20), and "Control" (n = 20) groups. Participants completed two sessions. The first session involved an assessment of SIN recognition and voice discrimination (VD) with word or sentence stimuli, employing combined fundamental frequency (F0) + formant frequencies voice cues. Subsequently, only the trained groups proceeded to an interleaved training phase, encompassing six VD blocks with sentence stimuli, utilizing either F0-only or formant-only cues. The second session replicated the interleaved training for the trained groups, followed by a second assessment conducted by all three groups, identical to the first session. Results showed significant improvements in the trained task regardless of training conditions. However, VD training with a single cue did not enhance VD with both cues beyond control group improvements, suggesting limited generalization. Notably, the Trained-in-Noise group exhibited the most significant SIN recognition improvements posttraining, implying generalization across tasks that share similar acoustic conditions. Overall, findings suggest training conditions impact generalization by influencing processing levels associated with the trained task. Training in noisy conditions may prompt higher auditory and/or cognitive processing than training in quiet, potentially extending skills to tasks involving challenging listening conditions, such as SIN recognition. These insights hold significant theoretical and clinical implications, potentially advancing the development of effective auditory training protocols.}, } @article {pmid39185222, year = {2024}, author = {Parrell, B and Naber, C and Kim, OA and Nizolek, CA and McDougle, SD}, title = {Audiomotor prediction errors drive speech adaptation even in the absence of overt movement.}, journal = {bioRxiv : the preprint server for biology}, volume = {}, number = {}, pages = {}, pmid = {39185222}, issn = {2692-8205}, support = {R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC019134/DC/NIDCD NIH HHS/United States ; R01 NS132926/NS/NINDS NIH HHS/United States ; }, abstract = {Observed outcomes of our movements sometimes differ from our expectations. 
These sensory prediction errors recalibrate the brain's internal models for motor control, reflected in alterations to subsequent movements that counteract these errors (motor adaptation). While leading theories suggest that all forms of motor adaptation are driven by learning from sensory prediction errors, dominant models of speech adaptation argue that adaptation results from integrating time-advanced copies of corrective feedback commands into feedforward motor programs. Here, we tested these competing theories of speech adaptation by inducing planned, but not executed, speech. Human speakers (male and female) were prompted to speak a word and, on a subset of trials, were rapidly cued to withhold the prompted speech. On standard trials, speakers were exposed to real-time playback of their own speech with an auditory perturbation of the first formant to induce single-trial speech adaptation. Speakers experienced a similar sensory error on movement cancelation trials, hearing a perturbation applied to a recording of their speech from a previous trial at the time they would have spoken. Speakers adapted to auditory prediction errors in both contexts, altering the spectral content of spoken vowels to counteract formant perturbations even when no actual movement coincided with the perturbed feedback. These results build upon recent findings in reaching, and suggest that prediction errors, rather than corrective motor commands, drive adaptation in speech.}, } @article {pmid39182457, year = {2024}, author = {Chan, RKW and Wang, BX}, title = {Do long-term acoustic-phonetic features and mel-frequency cepstral coefficients provide complementary speaker-specific information for forensic voice comparison?.}, journal = {Forensic science international}, volume = {363}, number = {}, pages = {112199}, doi = {10.1016/j.forsciint.2024.112199}, pmid = {39182457}, issn = {1872-6283}, mesh = {Humans ; Male ; *Phonetics ; *Speech Acoustics ; Sound Spectrography ; *Voice Quality ; Adult ; Forensic Sciences/methods ; Middle Aged ; Young Adult ; Signal Processing, Computer-Assisted ; }, abstract = {A growing number of studies in forensic voice comparison have explored how elements of phonetic analysis and automatic speaker recognition systems may be integrated for optimal speaker discrimination performance. However, few studies have investigated the evidential value of long-term speech features using forensically-relevant speech data. This paper reports an empirical validation study that assesses the evidential strength of the following long-term features: fundamental frequency (F0), formant distributions, laryngeal voice quality, mel-frequency cepstral coefficients (MFCCs), and combinations thereof. Non-contemporaneous recordings with speech style mismatch from 75 male Australian English speakers were analyzed. Results show that 1) MFCCs outperform long-term acoustic phonetic features; 2) source and filter features do not provide considerably complementary speaker-specific information; and 3) the addition of long-term phonetic features to an MFCCs-based system does not lead to meaningful improvement in system performance. 
Implications for the complementarity of phonetic analysis and automatic speaker recognition systems are discussed.}, } @article {pmid39175901, year = {2024}, author = {Huang, L and Yang, H and Che, Y and Yang, J}, title = {Automatic speech analysis for detecting cognitive decline of older adults.}, journal = {Frontiers in public health}, volume = {12}, number = {}, pages = {1417966}, pmid = {39175901}, issn = {2296-2565}, mesh = {Humans ; Aged ; Female ; Male ; *Cognitive Dysfunction/diagnosis ; China ; Alzheimer Disease/diagnosis ; Aged, 80 and over ; Speech ; Middle Aged ; Bayes Theorem ; Support Vector Machine ; Algorithms ; }, abstract = {BACKGROUND: Speech analysis has been expected to serve as a screening tool for early detection of Alzheimer's disease (AD) and mild cognitive impairment (MCI). Acoustic features and linguistic features are usually used in speech analysis. However, no studies have yet determined which type of features provides better screening effectiveness, especially in the large aging population of China.
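For reference, the long-term MFCC features compared in the forensic study above (and included among the acoustic features in the study below) are commonly summarized as recording-level statistics of frame-wise coefficients, as in this sketch; the file path is a hypothetical placeholder, and librosa is assumed to be available.

import numpy as np
import librosa

y, sr = librosa.load("speaker_recording.wav", sr=16000)  # hypothetical file
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)       # shape (13, n_frames)
long_term = np.concatenate([mfcc.mean(axis=1), mfcc.std(axis=1)])
print(long_term.shape)   # a 26-dimensional long-term representation of the recording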

OBJECTIVE: Firstly, to compare the screening effectiveness of acoustic features, linguistic features, and their combination using the same dataset. Secondly, to develop a Chinese automated diagnosis model using self-collected natural discourse data obtained from native Chinese speakers.

METHODS: A total of 92 participants from communities in Shanghai completed the MoCA-B and a picture description task based on the Cookie Theft picture under the guidance of trained operators, and were divided into three groups, AD, MCI, and healthy control (HC), based on their MoCA-B scores. Acoustic features (pitch, jitter, shimmer, MFCCs, formants) and linguistic features (part-of-speech, type-token ratio, information words, information units) were extracted. The machine learning algorithms used in this study included logistic regression, random forest (RF), support vector machines (SVM), Gaussian Naive Bayes (GNB), and k-nearest neighbors (kNN). The validation accuracies of the same ML model using acoustic features, linguistic features, and their combination were compared.
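A scikit-learn sketch of the validation-accuracy comparison described in this paragraph is given below; X_acoustic, X_linguistic, and y are random stand-ins for the study's extracted features and MoCA-B-derived labels, not real data.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

rng = np.random.default_rng(0)
X_acoustic = rng.normal(size=(92, 20))     # placeholder acoustic features
X_linguistic = rng.normal(size=(92, 10))   # placeholder linguistic features
y = rng.integers(0, 2, size=92)            # placeholder HC vs. AD labels

models = {"LogReg": LogisticRegression(max_iter=1000), "RF": RandomForestClassifier(),
          "SVM": SVC(), "GNB": GaussianNB(), "kNN": KNeighborsClassifier()}
for feats, X in [("acoustic", X_acoustic), ("linguistic", X_linguistic),
                 ("combined", np.hstack([X_acoustic, X_linguistic]))]:
    for name, model in models.items():
        acc = cross_val_score(model, X, y, cv=5).mean()   # 5-fold validation accuracy
        print(f"{feats:10s} {name:6s} accuracy={acc:.3f}")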

RESULTS: The accuracy with linguistic features was generally higher than with acoustic features in training. The highest accuracy in differentiating HC and AD was 80.77%, achieved by SVM based on all the features extracted from the speech data, while the highest accuracy in differentiating HC from AD or MCI was 80.43%, achieved by RF based only on linguistic features.

CONCLUSION: Our results suggest the utility and validity of linguistic features in the automated diagnosis of cognitive impairment, and validate the applicability of automated diagnosis for Chinese language data.}, } @article {pmid39171236, year = {2024}, author = {Holmes, L and Rieger, G and Paulmann, S}, title = {The effect of sexual orientation on voice acoustic properties.}, journal = {Frontiers in psychology}, volume = {15}, number = {}, pages = {1412372}, pmid = {39171236}, issn = {1664-1078}, abstract = {INTRODUCTION: Previous research has investigated sexual orientation differences in the acoustic properties of individuals' voices, often theorizing that homosexuals of both sexes would have voice properties mirroring those of heterosexuals of the opposite sex. Findings were mixed, but many of these studies have methodological limitations including small sample sizes, use of recited passages instead of natural speech, or grouping bisexual and homosexual participants together for analyses.

METHODS: To address these shortcomings, the present study examined a wide range of acoustic properties in the natural voices of 142 men and 175 women of varying sexual orientations, with sexual orientation treated as a continuous variable throughout.

RESULTS: Homosexual men had less breathy voices (as indicated by a lower harmonics-to-noise ratio) and, contrary to our prediction, a lower voice pitch and narrower pitch range than heterosexual men. Homosexual women had lower F4 formant frequency (vocal tract resonance or so-called overtone) in overall vowel production, and rougher voices (measured via jitter and spectral tilt) than heterosexual women. For those sexual orientation differences that were statistically significant, bisexuals were in-between heterosexuals and homosexuals. No sexual orientation differences were found in formants F1-F3, cepstral peak prominence, shimmer, or speech rate in either sex.

DISCUSSION: Recommendations for future "natural voice" investigations are outlined.}, } @article {pmid39091036, year = {2024}, author = {Goncharova, M and Jadoul, Y and Reichmuth, C and Fitch, WT and Ravignani, A}, title = {Vocal tract dynamics shape the formant structure of conditioned vocalizations in a harbor seal.}, journal = {Annals of the New York Academy of Sciences}, volume = {1538}, number = {1}, pages = {107-116}, doi = {10.1111/nyas.15189}, pmid = {39091036}, issn = {1749-6632}, support = {(#W1262-B29)//Austrian Science Foundation Grant/ ; DNRF117//Danmarks Grundforskningsfond/ ; N00014-04-1-0284//Office of Naval Research/ ; Independent Max Planck Research Group Leader funding//Max-Planck-Gesellschaft/ ; Advanced Grant SOMACCA/ERC_/European Research Council/International ; }, mesh = {Animals ; *Vocalization, Animal/physiology ; Male ; Tongue/physiology ; Jaw/physiology/anatomy & histology ; Phocoena/physiology ; Humans ; }, abstract = {Formants, or resonance frequencies of the upper vocal tract, are an essential part of acoustic communication. Articulatory gestures-such as jaw, tongue, lip, and soft palate movements-shape formant structure in human vocalizations, but little is known about how nonhuman mammals use those gestures to modify formant frequencies. Here, we report a case study with an adult male harbor seal trained to produce an arbitrary vocalization composed of multiple repetitions of the sound wa. We analyzed jaw movements frame-by-frame and matched them to the tracked formant modulation in the corresponding vocalizations. We found that the jaw opening angle was strongly correlated with the first (F1) and, to a lesser degree, with the second formant (F2). F2 variation was better explained by the jaw angle opening when the seal was lying on his back rather than on the belly, which might derive from soft tissue displacement due to gravity. These results show that harbor seals share some common articulatory traits with humans, where the F1 depends more on the jaw position than F2. We propose further in vivo investigations of seals to further test the role of the tongue on formant modulation in mammalian sound production.}, } @article {pmid39086377, year = {2024}, author = {Dorman, MF and Natale, SC and Stohl, JS and Felder, J}, title = {Close approximations to the sound of a cochlear implant.}, journal = {Frontiers in human neuroscience}, volume = {18}, number = {}, pages = {1434786}, pmid = {39086377}, issn = {1662-5161}, abstract = {Cochlear implant (CI) systems differ in terms of electrode design and signal processing. It is likely that patients fit with different implant systems will experience different percepts when presented speech via their implant. The sound quality of speech can be evaluated by asking single-sided-deaf (SSD) listeners fit with a cochlear implant (CI) to modify clean signals presented to their typically hearing ear to match the sound quality of signals presented to their CI ear. In this paper, we describe very close matches to CI sound quality, i.e., similarity ratings of 9.5 to 10 on a 10-point scale, by ten patients fit with a 28 mm electrode array and MED EL signal processing. The modifications required to make close approximations to CI sound quality fell into two groups: One consisted of a restricted frequency bandwidth and spectral smearing while a second was characterized by a wide bandwidth and no spectral smearing. 
Both sets of modifications were different from those found for patients with shorter electrode arrays, who chose upshifts in voice pitch and formant frequencies to match CI sound quality. The data from matching-based metrics of CI sound quality document that speech sound quality differs for patients fit with different CIs and among patients fit with the same CI.}, } @article {pmid39056002, year = {2024}, author = {Bonacina, S and Krizman, J and Farley, J and Nicol, T and LaBella, CR and Kraus, N}, title = {Persistent post-concussion symptoms include neural auditory processing in young children.}, journal = {Concussion (London, England)}, volume = {9}, number = {1}, pages = {CNC114}, pmid = {39056002}, issn = {2056-3299}, abstract = {AIM: Difficulty understanding speech following concussion is likely caused by auditory processing impairments. We hypothesized that concussion disrupts pitch and phonetic processing of a sound, both of which are cues to understanding a talker.

We obtained frequency following responses to a syllable from 120 concussed and 120 control children. Encoding of the fundamental frequency (F0), a pitch cue, and the first formant (F1), a phonetic cue, was poorer in concussed children. The F0 reduction was greater in the children assessed within 2 weeks of their injuries.
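F0 encoding in a frequency-following response is often quantified as the spectral amplitude of the averaged response at the stimulus F0; the sketch below illustrates the idea, with the response array, sampling rate, and 100 Hz F0 all invented for demonstration.

import numpy as np

def f0_encoding(response, sr, f0=100.0, half_bw=5.0):
    """Mean spectral amplitude in a narrow band around the stimulus F0."""
    spec = np.abs(np.fft.rfft(response * np.hanning(len(response))))
    freqs = np.fft.rfftfreq(len(response), 1.0 / sr)
    band = (freqs >= f0 - half_bw) & (freqs <= f0 + half_bw)
    return spec[band].mean()

sr = 8000
t = np.arange(0, 0.2, 1 / sr)
fake_ffr = np.sin(2 * np.pi * 100 * t) + 0.5 * np.random.randn(t.size)  # toy response
print(f0_encoding(fake_ffr, sr))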

CONCLUSION: Concussions affect auditory processing. Results strengthen evidence of reduced F0 encoding in children with concussion and call for longitudinal study aimed at monitoring the recovery course with respect to the auditory system.}, } @article {pmid39026879, year = {2024}, author = {Li, JJ and Daliri, A and Kim, KS and Max, L}, title = {Does pre-speech auditory modulation reflect processes related to feedback monitoring or speech movement planning?.}, journal = {bioRxiv : the preprint server for biology}, volume = {}, number = {}, pages = {}, pmid = {39026879}, issn = {2692-8205}, support = {R01 DC007603/DC/NIDCD NIH HHS/United States ; R01 DC017444/DC/NIDCD NIH HHS/United States ; R01 DC020707/DC/NIDCD NIH HHS/United States ; R01 DC014510/DC/NIDCD NIH HHS/United States ; R01 DC020162/DC/NIDCD NIH HHS/United States ; }, abstract = {Previous studies have revealed that auditory processing is modulated during the planning phase immediately prior to speech onset. To date, the functional relevance of this pre-speech auditory modulation (PSAM) remains unknown. Here, we investigated whether PSAM reflects neuronal processes that are associated with preparing auditory cortex for optimized feedback monitoring as reflected in online speech corrections. Combining electroencephalographic PSAM data from a previous data set with new acoustic measures of the same participants' speech, we asked whether individual speakers' extent of PSAM is correlated with the implementation of within-vowel articulatory adjustments during /b/-vowel-/d/ word productions. Online articulatory adjustments were quantified as the extent of change in inter-trial formant variability from vowel onset to vowel midpoint (a phenomenon known as centering). This approach allowed us to also consider inter-trial variability in formant production and its possible relation to PSAM at vowel onset and midpoint separately. Results showed that inter-trial formant variability was significantly smaller at vowel midpoint than at vowel onset. PSAM was not significantly correlated with this amount of change in variability as an index of within-vowel adjustments. Surprisingly, PSAM was negatively correlated with inter-trial formant variability not only in the middle but also at the very onset of the vowels. Thus, speakers with more PSAM produced formants that were already less variable at vowel onset. Findings suggest that PSAM may reflect processes that influence speech acoustics as early as vowel onset and, thus, that are directly involved in motor command preparation (feedforward control) rather than output monitoring (feedback control).}, } @article {pmid39019670, year = {2024}, author = {Doyle, KA and Harel, D and Feeny, GT and Novak, VD and McAllister, T}, title = {Word and Gender Identification in the Speech of Transgender Individuals.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.06.007}, pmid = {39019670}, issn = {1873-4588}, support = {R21 DC021537/DC/NIDCD NIH HHS/United States ; }, abstract = {Listeners use speech to identify both linguistic information, such as the word being produced, and indexical attributes, such as the gender of the speaker. Previous research has shown that these two aspects of speech perception are interrelated. 
It is important to understand this relationship in the context of gender-affirming voice training (GAVT), where changes in speech production as part of a speaker's gender-affirming care could potentially influence listeners' recognition of the intended utterance. This study conducted a secondary analysis of data from an experiment in which trans women matched shifted targets for the second formant frequency using visual-acoustic biofeedback. Utterances were synthetically altered to feature a gender-ambiguous fundamental frequency and were presented to blinded listeners for rating on a visual analog scale representing the gender spectrum, as well as word identification in a forced-choice task. We found a statistically significant association between the accuracy of word identification and the gender rating of utterances. However, there was no statistically significant difference in word identification accuracy for the formant-shifted conditions relative to an unshifted condition. Overall, these results support previous research in finding that word identification and speaker gender identification are interrelated processes; however, the findings also suggest that a small magnitude of shift in formant frequencies (of the type that might be pursued in a GAVT context) does not have a significant negative impact on the perceptual recoverability of isolated words.}, } @article {pmid38985077, year = {2024}, author = {Lorenzoni, DC and Henriques, JFC and Silva, LKD and Rosa, RR and Berretin-Felix, G and Freitas, KMS and Janson, G}, title = {Comparison of speech changes caused by four different orthodontic retainers: a crossover randomized clinical trial.}, journal = {Dental press journal of orthodontics}, volume = {29}, number = {3}, pages = {e2423277}, pmid = {38985077}, issn = {2177-6709}, mesh = {Humans ; *Orthodontic Retainers ; Female ; Male ; Adult ; *Cross-Over Studies ; Orthodontic Appliance Design ; Young Adult ; Speech/physiology ; }, abstract = {OBJECTIVE: This study aimed to compare the influence of four different maxillary removable orthodontic retainers on speech.

MATERIAL AND METHODS: Eligibility criteria for sample selection were: subjects aged 20-40 years with acceptable occlusion who were native speakers of Portuguese. The volunteers (n=21) were divided into four groups randomized with a 1:1:1:1 allocation ratio. The four groups used, in random order, the four types of retainers full-time for 21 days each, with a 7-day washout period. The removable maxillary retainers were: conventional wraparound, wraparound with an anterior hole, U-shaped wraparound, and thermoplastic retainer. Three volunteers were excluded. The final sample comprised 18 subjects (11 male; 7 female) with a mean age of 27.08 years (SD=4.65). The speech evaluation was performed on recordings of vocal excerpts made before, immediately after, and 21 days after the installation of each retainer, with auditory-perceptual and acoustic analysis of the formant frequencies F1 and F2 of the vowels. Repeated measures ANOVA and Friedman with Tukey tests were used for statistical comparison.
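A repeated-measures ANOVA of this design can be run in Python with statsmodels, as in the following sketch; the data-frame layout (one F1 measurement per subject, retainer, and time point) and all values are hypothetical, not the study's data or software.

import numpy as np
import pandas as pd
from statsmodels.stats.anova import AnovaRM

rng = np.random.default_rng(1)
subjects = range(18)
retainers = ["conventional", "hole", "U-shaped", "thermoplastic"]
times = ["pre", "post", "21d"]
rows = [{"subject": s, "retainer": r, "time": t, "F1": rng.normal(500, 50)}
        for s in subjects for r in retainers for t in times]   # balanced design
df = pd.DataFrame(rows)
res = AnovaRM(df, depvar="F1", subject="subject", within=["retainer", "time"]).fit()
print(res)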

RESULTS: Speech changes increased immediately after conventional wraparound and thermoplastic retainer installation and decreased after 21 days, though not to normal levels. However, this increase was statistically significant only for the wraparound with an anterior hole and the thermoplastic retainer. Formant frequencies of vowels were altered at the initial time point, and the changes remained for the conventional, U-shaped, and thermoplastic appliances after three weeks.

CONCLUSIONS: The thermoplastic retainer was more harmful to speech than the wraparound appliances. The conventional and U-shaped retainers interfered less with speech. The three-week period was not sufficient for speech adaptation.}, } @article {pmid38981448, year = {2024}, author = {Liu, B and Lei, J and Wischhoff, OP and Smereka, KA and Jiang, JJ}, title = {Acoustic Character Governing Variation in Normal, Benign, and Malignant Voices.}, journal = {Folia phoniatrica et logopaedica : official organ of the International Association of Logopedics and Phoniatrics (IALP)}, volume = {}, number = {}, pages = {1-10}, doi = {10.1159/000540255}, pmid = {38981448}, issn = {1421-9972}, abstract = {INTRODUCTION: Benign and malignant vocal fold lesions (VFLs) are growths that occur on the vocal folds. However, the treatments for these two types of lesions differ significantly. Therefore, it is imperative to use a multidisciplinary approach to properly recognize suspicious lesions. This study aimed to determine the important acoustic characteristics specific to benign and malignant VFLs.

METHODS: The acoustic model of voice quality was utilized to measure various acoustic parameters in 157 participants, including individuals with normal, benign, and malignant conditions. The study comprised 62 female and 95 male participants (43 ± 10 years). Voice samples were collected at the Shanghai Eye, Ear, Nose, and Throat Hospital of Fudan University between May 2020 and July 2021. The acoustic variables of the participants were analyzed using Principal Component Analysis (PCA) to identify important acoustic characteristics specific to normal vocal folds, benign VFLs, and malignant VFLs. The similarities and differences in acoustic factors were also studied for benign conditions including Reinke's edema, polyps, cysts, and leukoplakia.
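A PCA step of the kind described in this paragraph typically involves standardizing the acoustic variables, fitting the decomposition, and inspecting explained variance and loadings; a scikit-learn sketch follows, with a random feature matrix standing in for the study's data.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(2)
X = rng.normal(size=(157, 12))                  # placeholder acoustic variables
X_std = StandardScaler().fit_transform(X)       # z-score each variable
pca = PCA(n_components=5).fit(X_std)
print("explained variance ratio:", np.round(pca.explained_variance_ratio_, 3))
print("PC1 loadings:", np.round(pca.components_[0], 2))  # weights of each variable on PC1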

RESULTS: Using the PCA method, the components that accounted for the variation in the data were identified, highlighting acoustic characteristics in the normal, benign, and malignant groups. The analysis indicated that coefficients of variation in root mean square energy were observed solely within the normal group. Coefficients of variation in pitch (F0) were found to be significant only in benign voices, while higher formant frequencies and their variability were identified as contributors to the acoustic variance within the malignant group. The presence of formant dispersion (FD) as a weighted factor in PCA was exclusively noted in individuals with Reinke's edema. The amplitude ratio between subharmonics and harmonics (SHR) and its coefficients of variation were evident exclusively in the polyps group. In the case of voices with cysts, both pitch (F0) and coefficients of variation for FD were observed to contribute to variations. Additionally, higher formant frequencies and their coefficients of variation played a role in the acoustic variance among voices of patients with leukoplakia.

CONCLUSION: Experimental evidence demonstrates the utility of the PCA method in the identification of vibrational alterations in the acoustic characteristics of voice affected by lesions. Furthermore, the PCA analysis has highlighted underlying acoustic differences between various conditions such as Reinke's edema, polyps, cysts, and leukoplakia. These findings can be used in the future to develop an automated malignant voice analysis algorithm, which will facilitate timely intervention and management of vocal fold conditions.}, } @article {pmid38951556, year = {2024}, author = {Fletcher, MD and Akis, E and Verschuur, CA and Perry, SW}, title = {Improved tactile speech perception and noise robustness using audio-to-tactile sensory substitution with amplitude envelope expansion.}, journal = {Scientific reports}, volume = {14}, number = {1}, pages = {15029}, pmid = {38951556}, issn = {2045-2322}, support = {EP/W032422/1//Engineering and Physical Sciences Research Council/ ; EP/T517859/1//Engineering and Physical Sciences Research Council/ ; }, mesh = {Humans ; *Speech Perception/physiology ; Male ; Female ; Adult ; *Noise ; *Hearing Aids ; Young Adult ; Touch/physiology ; Acoustic Stimulation/methods ; Touch Perception/physiology ; Hearing Loss/physiopathology ; }, abstract = {Recent advances in haptic technology could allow haptic hearing aids, which convert audio to tactile stimulation, to become viable for supporting people with hearing loss. A tactile vocoder strategy for audio-to-tactile conversion, which exploits these advances, has recently shown significant promise. In this strategy, the amplitude envelope is extracted from several audio frequency bands and used to modulate the amplitude of a set of vibro-tactile tones. The vocoder strategy allows good consonant discrimination, but vowel discrimination is poor and the strategy is susceptible to background noise. In the current study, we assessed whether multi-band amplitude envelope expansion can effectively enhance critical vowel features, such as formants, and improve speech extraction from noise. In 32 participants with normal touch perception, tactile-only phoneme discrimination with and without envelope expansion was assessed both in quiet and in background noise. Envelope expansion improved performance in quiet by 10.3% for vowels and by 5.9% for consonants. In noise, envelope expansion improved overall phoneme discrimination by 9.6%, with no difference in benefit between consonants and vowels. The tactile vocoder with envelope expansion can be deployed in real-time on a compact device and could substantially improve clinical outcomes for a new generation of haptic hearing aids.}, } @article {pmid38916010, year = {2024}, author = {Sahoo, AK and Sahoo, PK and Gupta, V and Behera, G and Sidam, S and Mishra, UP and Chavan, A and Binu, R and Gour, S and Velayutham, DK and Pooja, and Chatterjee, T and Pal, D}, title = {Assessment of Changes in the Quality of Voice in Post-thyroidectomy Patients With Intact Recurrent and Superior Laryngeal Nerve Function.}, journal = {Cureus}, volume = {16}, number = {5}, pages = {e60873}, pmid = {38916010}, issn = {2168-8184}, abstract = {Background Thyroidectomy is a routinely performed surgical procedure used to treat benign, malignant, and some hormonal disorders of the thyroid that are not responsive to medical therapy. Voice alterations following thyroid surgery are well-documented and often attributed to recurrent laryngeal nerve dysfunction. 
However, subtle changes in voice quality can persist despite anatomically intact laryngeal nerves. This study aimed to quantify post-thyroidectomy voice changes in patients with intact laryngeal nerves, focusing on fundamental frequency, first formant frequency, shimmer intensity, and maximum phonation duration. Methodology This cross-sectional study was conducted at a tertiary referral center in central India and focused on post-thyroidectomy patients with normal vocal cord function. Preoperative assessments included laryngeal endoscopy and voice recording using a computer program, with evaluations repeated at one and three months post-surgery. Patients with normal laryngeal endoscopic findings underwent voice analysis and provided feedback on subjective voice changes. The PRAAT version 6.2 software was utilized for voice analysis. Results The study included 41 patients with normal laryngoscopic findings after thyroid surgery, with the majority being female (85.4%) and the average age being 42.4 years. Hemithyroidectomy was performed in 41.4% of patients and total thyroidectomy in 58.6%, with eight patients undergoing central compartment neck dissection. All but one patient reported no subjective change in voice following surgery. Objective voice analysis showed statistically significant changes in the one-month postoperative period compared to preoperative values, including a 5.87% decrease in fundamental frequency, a 1.37% decrease in shimmer intensity, and a 6.24% decrease in first formant frequency, along with a 4.35% decrease in maximum phonatory duration. These trends persisted at the three-month postoperative period, although values approached preoperative levels. Results revealed statistically significant alterations in voice parameters, particularly fundamental frequency and first formant frequency, with greater values observed in total thyroidectomy patients. Shimmer intensity also exhibited slight changes. Comparison between the hemithyroidectomy and total thyroidectomy groups revealed no significant differences in fundamental frequency, first formant frequency, and shimmer. However, maximum phonation duration showed a significantly greater change in the hemithyroidectomy group at both one-month and three-month postoperative intervals. Conclusions This study on post-thyroidectomy patients with normal vocal cord movement revealed significant changes in voice parameters postoperatively, with most patients reporting no subjective voice changes. The findings highlight the importance of objective voice analysis in assessing post-thyroidectomy voice outcomes.}, } @article {pmid38890016, year = {2024}, author = {Xiu, N and Li, W and Liu, L and Liu, Z and Cai, Z and Li, L and Vaxelaire, B and Sock, R and Ling, Z and Chen, J and Wang, Y}, title = {A Study on Voice Measures in Patients with Parkinson's Disease.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.05.018}, pmid = {38890016}, issn = {1873-4588}, abstract = {PURPOSE: This research aims to identify acoustic features that can distinguish patients with Parkinson's disease (PD patients) from healthy speakers.

METHODS: Thirty PD patients and 30 healthy speakers were recruited in the experiment, and their speech was collected, including three vowels (/i/, /a/, and /u/) and nine consonants (/p/, /pʰ/, /t/, /tʰ/, /k/, /kʰ/, /l/, /m/, and /n/). Acoustic features such as fundamental frequency (F0), jitter, shimmer, harmonics-to-noise ratio (HNR), first formant (F1), second formant (F2), third formant (F3), first bandwidth (B1), second bandwidth (B2), third bandwidth (B3), voice onset, and voice onset time were analyzed in our experiment. The two-sample independent t test and the nonparametric Mann-Whitney U (MWU) test were applied, as appropriate, to compare the acoustic measures between the PD patients and healthy speakers. In addition, after identifying the acoustic features effective for distinguishing PD patients from healthy speakers, we adopted two methods to detect PD patients: (1) building classifiers directly on the effective acoustic features and (2) training support vector machine classifiers with the effective acoustic features.
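As a rough illustration of approach (2), training an SVM on a small set of acoustic features might look like the sketch below; the data, labels, kernel, and cross-validation scheme are all assumptions for demonstration, not the authors' implementation.

```python
# Minimal sketch: SVM classification of PD vs. healthy speakers from acoustic features.
# Feature matrix and labels are random placeholders.
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 7))   # 60 speakers x 7 effective acoustic features (placeholder)
y = np.repeat([0, 1], 30)      # 0 = healthy control, 1 = PD (placeholder labels)

clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=1.0))
acc = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
print(f"cross-validated accuracy: {acc.mean():.2f}")
```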

RESULTS: Significant differences were found between the male PD group and the male healthy controls in vowel /i/ (jitter and shimmer) and /a/ (shimmer and HNR). Among female subjects, significant differences were observed in the F0 standard deviation (F0 SD) of /u/ between the two groups. Additionally, significant differences between the PD group and healthy controls were also found in the F3 of /i/ and /n/, whereas other acoustic features showed no significant differences between the two groups. The HNR of vowel /a/ yielded the best classification accuracy of the seven acoustic features found to distinguish PD patients from healthy speakers.

CONCLUSIONS: PD can cause changes in the articulation and phonation of PD patients, wherein increases or decreases occur in some acoustic features. Therefore, the use of acoustic features to detect PD is expected to be a low-cost and large-scale diagnostic method.}, } @article {pmid38880296, year = {2024}, author = {Weirich, M and Simpson, AP and Knutti, N}, title = {Effects of testosterone on speech production and perception: Linking hormone levels in males to vocal cues and female voice attractiveness ratings.}, journal = {Physiology & behavior}, volume = {283}, number = {}, pages = {114615}, doi = {10.1016/j.physbeh.2024.114615}, pmid = {38880296}, issn = {1873-507X}, mesh = {Humans ; *Testosterone/metabolism/pharmacology ; Male ; Adult ; Young Adult ; *Saliva/metabolism/chemistry ; *Hydrocortisone/metabolism ; *Speech Perception/physiology/drug effects ; *Speech/physiology/drug effects ; *Voice/drug effects ; *Cues ; Female ; Beauty ; Acoustic Stimulation ; }, abstract = {This study sets out to investigate the potential effect of males' testosterone level on speech production and speech perception. Regarding speech production, we investigate intra- and inter-individual variation in mean fundamental frequency (fo) and formant frequencies and highlight the potential interacting effect of another hormone, i.e. cortisol. In addition, we investigate the influence of different speech materials on the relationship between testosterone and speech production. Regarding speech perception, we investigate the potential effect of individual differences in males' testosterone level on ratings of attractiveness of female voices. In the production study, data is gathered from 30 healthy adult males ranging from 19 to 27 years (mean age: 22.4, SD: 2.2) who recorded their voices and provided saliva samples at 9 am, 12 noon and 3 pm on a single day. Speech material consists of sustained vowels, counting, read speech and a free description of pictures. Biological measures comprise speakers' height, grip strength, and hormone levels (testosterone and cortisol). In the perception study, participants were asked to rate the attractiveness of female voice stimuli (sentence stimulus, same-speaker pairs) that were manipulated in three steps regarding mean fo and formant frequencies. Regarding speech production, our results show that testosterone affected mean fo (but not formants) both within and between speakers. This relationship was weakened in speakers with high cortisol levels and depended on the speech material. Regarding speech perception, we found female stimuli with higher mean fo and formants to be rated as sounding more attractive than stimuli with lower mean fo and formants. Moreover, listeners with low testosterone showed an increased sensitivity to vocal cues of female attractiveness. While our results of the production study support earlier findings of a relationship between testosterone and mean fo in males (which is mediated by cortisol), they also highlight the relevance of the speech material: The effect of testosterone was strongest in sustained vowels, potentially due to a strengthened effect of hormones on physiologically strongly influenced tasks such as sustained vowels in contrast to more free speech tasks such as a picture description. 
The perception study is the first to show an effect of males' testosterone level on female attractiveness ratings using voice stimuli.}, } @article {pmid38852197, year = {2024}, author = {Krupić, F and Moravcova, M and Dervišević, E and Čustović, S and Grbić, K and Lindström, P}, title = {When time does not heal all wounds: three decades' experience of immigrants living in Sweden.}, journal = {Medicinski glasnik : official publication of the Medical Association of Zenica-Doboj Canton, Bosnia and Herzegovina}, volume = {21}, number = {2}, pages = {}, doi = {10.17392/1696-21-02}, pmid = {38852197}, issn = {1840-2445}, abstract = {AIM: To investigate how immigrants from the Balkan region experienced their current life situation after living in Sweden for 30 years or more.

MATERIALS: The study was designed as a qualitative study using data from interviews with informants from five Balkan countries. The inclusion criteria were informants who were immigrants to Sweden and had lived in Sweden for more than 30 years. Five groups comprising sixteen informants were invited to participate in the study, and they all agreed.

RESULTS: The analysis of the interviews resulted in three main categories: "from someone to no one", "labour market", and "discrimination". All the informants reported that their education and life experience were treated as worthless; that they had to start their lives over and re-educate; that they applied for many jobs, often without reply; and that, on finally getting a job for which they were educated, they were humiliated every day, treated separately, and discriminated against.

CONCLUSION: Coming to Sweden with all their problems, finding that their education and work experience counted for nothing in Sweden, studying Swedish and repeating all their education, applying for jobs without receiving answers, and finally getting a job but being treated differently and discriminated against on a daily basis was experienced by all the informants as terrible. Although many similar studies exist in Sweden, further work of this kind remains valuable for prospective immigrants and prospective employers in Sweden.}, } @article {pmid38847582, year = {2024}, author = {Mittapalle, KR and Alku, P}, title = {Classification of phonation types in singing voice using wavelet scattering network-based features.}, journal = {JASA express letters}, volume = {4}, number = {6}, pages = {}, doi = {10.1121/10.0026241}, pmid = {38847582}, issn = {2691-1191}, abstract = {The automatic classification of phonation types in singing voice is essential for tasks such as identification of singing style. In this study, it is proposed to use wavelet scattering network (WSN)-based features for classification of phonation types in singing voice. WSN, which has a close similarity with auditory physiological models, generates acoustic features that greatly characterize the information related to pitch, formants, and timbre. Hence, the WSN-based features can effectively capture the discriminative information across phonation types in singing voice. The experimental results show that the proposed WSN-based features improved phonation classification accuracy by at least 9% compared to state-of-the-art features.}, } @article {pmid38841122, year = {2024}, author = {Gorina-Careta, N and Arenillas-Alcón, S and Puertollano, M and Mondéjar-Segovia, A and Ijjou-Kadiri, S and Costa-Faidella, J and Gómez-Roig, MD and Escera, C}, title = {Exposure to bilingual or monolingual maternal speech during pregnancy affects the neurophysiological encoding of speech sounds in neonates differently.}, journal = {Frontiers in human neuroscience}, volume = {18}, number = {}, pages = {1379660}, pmid = {38841122}, issn = {1662-5161}, abstract = {INTRODUCTION: Exposure to maternal speech during the prenatal period shapes speech perception and linguistic preferences, allowing neonates to recognize stories heard frequently in utero and demonstrating an enhanced preference for their mother's voice and native language. Yet, with a high prevalence of bilingualism worldwide, it remains an open question whether monolingual and bilingual maternal speech during pregnancy influence the fetus' neural mechanisms underlying speech sound encoding differently.

METHODS: In the present study, the frequency-following response (FFR), an auditory evoked potential that reflects the complex spectrotemporal dynamics of speech sounds, was recorded to a two-vowel /oa/ stimulus in a sample of 129 healthy term neonates within 1 to 3 days after birth. Newborns were divided into two groups according to maternal language usage during the last trimester of gestation (monolingual; bilingual). Spectral amplitudes and spectral signal-to-noise ratios (SNR) at the stimulus fundamental (F0) and first formant (F1) frequencies of each vowel were, respectively, taken as measures of pitch and formant structure neural encoding.
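A minimal sketch of how spectral amplitude and a spectral SNR at a target frequency (e.g., a stimulus F0) could be computed from an averaged FFR epoch is shown below; the sampling rate, epoch, target frequency, and noise-band definition are illustrative assumptions, not the authors' pipeline.

```python
# Minimal sketch: spectral amplitude and SNR at a target frequency from an FFR epoch.
import numpy as np

def spectral_amp_and_snr(epoch, fs, f_target, noise_bw=50.0):
    spec = np.abs(np.fft.rfft(epoch)) / len(epoch)        # single-sided amplitude spectrum
    freqs = np.fft.rfftfreq(len(epoch), d=1.0 / fs)
    target = spec[np.argmin(np.abs(freqs - f_target))]    # amplitude at the target bin
    noise_idx = (np.abs(freqs - f_target) > 5) & (np.abs(freqs - f_target) < noise_bw)
    noise = spec[noise_idx].mean()                        # mean amplitude of flanking bins
    return target, 20 * np.log10(target / noise)          # amplitude, and SNR in dB

fs = 16000
epoch = np.random.randn(fs // 4)                          # placeholder 250-ms averaged response
amp, snr = spectral_amp_and_snr(epoch, fs, f_target=113.0)  # 113 Hz is a hypothetical F0
print(amp, snr)
```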

RESULTS: Our results reveal that while spectral amplitudes at F0 did not differ between groups, neonates from bilingual mothers exhibited a lower spectral SNR. Additionally, monolingually exposed neonates exhibited a higher spectral amplitude and SNR at F1 frequencies.

DISCUSSION: We interpret our results as reflecting that bilingual maternal speech, compared with monolingual speech, is characterized by greater complexity in the speech sound signal, rendering newborns of bilingual mothers sensitive to a wider range of speech frequencies without generating a particularly strong response at any of them. Our results contribute to an expanding body of research indicating the influence of prenatal experiences on language acquisition and underscore the necessity of including prenatal language exposure in developmental studies on language acquisition, a variable often overlooked yet capable of influencing research outcomes.}, } @article {pmid38820240, year = {2024}, author = {Wu, HY}, title = {Uncovering Gender-Specific and Cross-Gender Features in Mandarin Deception: An Acoustic and Electroglottographic Approach.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {7}, pages = {2021-2037}, doi = {10.1044/2024_JSLHR-23-00288}, pmid = {38820240}, issn = {1558-9102}, mesh = {Humans ; Female ; Male ; *Speech Acoustics ; Young Adult ; Adult ; *Deception ; *Language ; Glottis/physiology ; Sex Factors ; China ; Electrodiagnosis ; }, abstract = {PURPOSE: This study aimed to investigate the acoustic and electroglottographic (EGG) profiles of Mandarin deception, including global characteristics and the influence of gender.

METHOD: Thirty-six Mandarin speakers participated in an interactive interview game in which they provided both deceptive and truthful answers to 14 biographical questions. Acoustic and EGG signals of the participants' responses were simultaneously recorded; 20 acoustic and 14 EGG features were analyzed using binary logistic regression models.
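A binary logistic regression over acoustic/EGG predictors, as described above, can be sketched with statsmodels; the feature set, trial counts, and data below are placeholders, not the study's materials.

```python
# Minimal sketch: logistic regression predicting deceptive (1) vs. truthful (0) responses.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
# 36 speakers x 14 questions = 504 responses (placeholder count and features:
# e.g., F0 mean, intensity mean, jitter, contact quotient)
X = rng.normal(size=(504, 4))
y = rng.integers(0, 2, size=504)

results = sm.Logit(y, sm.add_constant(X)).fit(disp=False)
print(results.summary())   # coefficient signs show increases/decreases with deception
```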

RESULTS: Increases in fundamental frequency (F0) mean, intensity mean, first formant (F1), fifth formant (F5), contact quotient (CQ), decontacting-time quotient (DTQ), and contact index (CI) as well as decreases in jitter, shimmer, harmonics-to-noise ratio (HNR), and fourth formant (F4) were significantly correlated with global deception. Cross-gender features included increases in intensity mean and F5 and decreases in jitter, HNR, and F4, whereas gender-specific features encompassed increases in F0 mean, shimmer, F1, third formant, and DTQ, as well as decreases in F0 maximum and CQ for female deception, and increases in CQ and CI and decreases in shimmer for male deception.

CONCLUSIONS: The results suggest that Mandarin deception could be tied to underlying pragmatic functions, emotional arousal, decreased glottal contact skewness, and more pressed phonation. Disparities in gender-specific features lend support to differences in the use of pragmatics, levels of deception-induced emotional arousal, skewness of glottal contact patterns, and phonation types.}, } @article {pmid38789366, year = {2024}, author = {Neuhaus, TJ and Scherer, RC and Whitfield, JA}, title = {Gender Perception of Speech: Dependence on Fundamental Frequency, Implied Vocal Tract Length, and Source Spectral Tilt.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.01.014}, pmid = {38789366}, issn = {1873-4588}, abstract = {OBJECTIVE: To investigate how listeners use fundamental frequency, implied vocal tract length, and source spectral tilt to infer speaker gender.

METHODS: Sound files each containing the vowels /i, æ, ɑ, u/ interspersed with brief silences were synthesized. Each of the 210 stimuli was a combination of 10 values for fundamental frequency and 7 values for implied vocal tract length (and the associated formant frequencies) ranging from male-typical to female-typical, and 3 values for source spectral tilt approximating the voice qualities of breathy, normal, and pressed. Twenty-three listeners judged each synthesized "speaker" as "female" or "male." Generalized linear mixed model analysis was used to determine the extent to which fundamental frequency, implied vocal tract length, and spectral tilt influenced listener judgment.
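The 10 x 7 x 3 factorial design yields the 210 stimuli mentioned above; a minimal sketch of building such a grid follows. The endpoint values are hypothetical male-typical to female-typical settings, since the abstract does not report the actual values.

```python
# Minimal sketch: the full factorial stimulus grid (10 f0 x 7 VTL x 3 tilt = 210).
import itertools
import numpy as np

f0_values = np.linspace(100, 220, 10)     # Hz; hypothetical endpoints
vtl_values = np.linspace(17.5, 14.0, 7)   # cm; hypothetical implied vocal tract lengths
tilt_values = ["breathy", "normal", "pressed"]

stimuli = list(itertools.product(f0_values, vtl_values, tilt_values))
assert len(stimuli) == 210
```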

RESULTS: Increasing fundamental frequency and decreasing implied vocal tract length resulted in an increased probability of female judgment. Two interactions were identified: both an increase in fundamental frequency and a decrease in source spectral tilt (more negative) produced a greater increase in the probability of a female judgment when the implied vocal tract length was relatively short.

CONCLUSIONS: The relationships among fundamental frequency, implied vocal tract length, source spectral tilt, and probability of female judgment changed across the range of normal values, suggesting that the relative contributions of fundamental frequency and implied vocal tract length to gender perception varied over the ranges studied. There was no threshold of fundamental frequency or implied vocal tract length that dramatically shifted the perception between male and female.}, } @article {pmid38782960, year = {2024}, author = {Balolia, KL and Fitzgerald, PL}, title = {Male proboscis monkey cranionasal size and shape is associated with visual and acoustic signalling.}, journal = {Scientific reports}, volume = {14}, number = {1}, pages = {10715}, pmid = {38782960}, issn = {2045-2322}, mesh = {Animals ; Male ; *Sex Characteristics ; Nasal Cavity/anatomy & histology/physiology ; Nose/anatomy & histology ; Animal Communication ; Acoustics ; Skull/anatomy & histology ; Vocalization, Animal/physiology ; Female ; }, abstract = {The large nose adorned by adult male proboscis monkeys is hypothesised to serve as an audiovisual signal of sexual selection. It serves as a visual signal of male quality and social status, and as an acoustic signal, through the expression of loud, low-formant nasalised calls in dense rainforests, where visibility is poor. However, it is unclear how the male proboscis monkey nasal complex, including the internal structure of the nose, plays a role in visual or acoustic signalling. Here, we use cranionasal data to assess whether large noses found in male proboscis monkeys serve visual and/or acoustic signalling functions. Our findings support a visual signalling function for male nasal enlargement through a relatively high degree of nasal aperture sexual size dimorphism, the craniofacial region to which nasal soft tissue attaches. We additionally find nasal aperture size increases beyond dental maturity among male proboscis monkeys, consistent with the visual signalling hypothesis. We show that the cranionasal region has an acoustic signalling role through pronounced nasal cavity sexual shape dimorphism, wherein male nasal cavity shape allows the expression of loud, low-formant nasalised calls. Our findings provide robust support for the male proboscis monkey nasal complex serving both visual and acoustic functions.}, } @article {pmid38778635, year = {2024}, author = {Beach, SD and Niziolek, CA}, title = {Inhibitory modulation of speech trajectories: Evidence from a vowel-modified Stroop task.}, journal = {Cognitive neuropsychology}, volume = {41}, number = {1-2}, pages = {51-69}, pmid = {38778635}, issn = {1464-0627}, support = {R00 DC014520/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Stroop Test ; *Inhibition, Psychological ; Male ; *Speech/physiology ; Female ; *Reaction Time/physiology ; Adult ; Young Adult ; Reading ; Phonetics ; Attention/physiology ; }, abstract = {How does cognitive inhibition influence speaking? The Stroop effect is a classic demonstration of the interference between reading and color naming. We used a novel variant of the Stroop task to measure whether this interference impacts not only the response speed, but also the acoustic properties of speech. Speakers named the color of words in three categories: congruent (e.g., red written in red), color-incongruent (e.g., green written in red), and vowel-incongruent - those with partial phonological overlap with their color (e.g., rid written in red, grain in green, and blow in blue). 
Our primary aim was to identify any effect of the distractor vowel on the acoustics of the target vowel. Participants were no slower to respond on vowel-incongruent trials, but formant trajectories tended to show a bias away from the distractor vowel, consistent with a phenomenon of acoustic inhibition that increases contrast between confusable alternatives.}, } @article {pmid38755075, year = {2024}, author = {Aaen, M and Sadolin, C}, title = {Towards Improved Auditory-Perceptual Assessment of Timbres: Comparing Accuracy and Reliability of Four Deconstructed Timbre Assessment Models.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.03.039}, pmid = {38755075}, issn = {1873-4588}, abstract = {UNLABELLED: Timbre is a central quality of singing, yet remains a complex notion poorly understood in psychoacoustic studies. Previous studies note how no single acoustic variable or combination of variables consistently predicts timbre dimensions. Timbre varies on a continuum from darkest to lightest. These extremes are associated with laryngeal and vocal tract adjustments related to smaller and larger vocal tract area and variations in vocal fold vibratory characteristics. Perceptually, timbre assessment is influenced by spectral characteristics and formant frequency adjustments, though these dimensions are not independently perceived. Perceptual studies repeatedly demonstrate difficulties in correlating variations in timbre stimuli to specific measures. A recent study demonstrated how the acoustic predictive salience of voice category and voice weight across pitches contributes to timbre assessments and concluded that timbre may be related to as-yet-unknown factor(s). The purpose of this study was to test four different models for assessing timbre: one focused on specific anatomy, one on listener intuition, one using auditory anchors, and one using expert raters in a deconstructed timbre model with five specific dimensions.

METHODS: Four independent panels were conducted with separate cohorts of professional singing teachers. Forty-one assessors took part in the anatomically focused panel, 54 in the intuition-based panel, 30 in the anchored panel, and 12 in the expert listener panel. Stimuli taken from live performances of well-known singers were used for all panels, representing all genders, genres, and styles across a large pitch range. All stimuli are available as Supplementary Materials. Fleiss' kappa values, descriptive statistics, and significance tests are reported for all panel assessments.

RESULTS: Panels 1 through 4 varied in overall accuracy and agreement. The intuition-based model showed overall 45% average accuracy (SD ± 4%), k = 0.289 (<0.001) compared to overall 71% average accuracy (SD ± 3%), k = 0.368 (<0.001) of the anatomical focused panel. The auditory-anchored model showed overall 75% average accuracy (SD ± 8%), k = 0.54 (<0.001) compared with overall 83% average accuracy and agreement of k = 0.63 (<0.001) for panel 4. Results revealed that the highest accuracy and reliability were achieved in a deconstructed timbre model and that providing anchoring improved reliability but with no further increase in accuracy.
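The agreement statistic reported above, Fleiss' kappa, can be computed as in the sketch below; the ratings matrix (stimuli by assessors, integer category codes) is a random placeholder, not the study's data.

```python
# Minimal sketch: Fleiss' kappa over panel ratings.
import numpy as np
from statsmodels.stats.inter_rater import aggregate_raters, fleiss_kappa

rng = np.random.default_rng(1)
ratings = rng.integers(0, 3, size=(76, 12))   # 76 stimuli rated by 12 raters (placeholder)

table, _ = aggregate_raters(ratings)          # per-stimulus counts for each category
print(fleiss_kappa(table, method="fleiss"))
```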

CONCLUSION: Deconstructing timbre into specific parameters improved auditory-perceptual accuracy and overall agreement. Assessing timbre along with other perceptual dimensions improves accuracy and reliability. Panel assessors' expert level of listening skills remains an important factor in obtaining reliable and accurate assessments of auditory stimuli for timbre dimensions. Anchoring improved reliability but with no further increase in accuracy. The study suggests that timbre assessment can be improved by approaching the percept through a prism of five specific dimensions, each related to specific physiology and auditory-perceptual subcategories. Further tests with framework-naïve listeners, nonmusically educated listeners, artificial intelligence comparisons, and synthetic stimuli are needed to further assess reliability.}, } @article {pmid38754028, year = {2024}, author = {Ning, LH and Hui, TC}, title = {The Accompanying Effect in Responses to Auditory Perturbations: Unconscious Vocal Adjustments to Unperturbed Parameters.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {6}, pages = {1731-1751}, doi = {10.1044/2024_JSLHR-23-00543}, pmid = {38754028}, issn = {1558-9102}, mesh = {Humans ; Male ; Female ; Young Adult ; *Pitch Perception/physiology ; Adult ; *Bayes Theorem ; Speech Perception/physiology ; Loudness Perception/physiology ; Feedback, Sensory/physiology ; Voice/physiology ; Acoustic Stimulation/methods ; Speech Acoustics ; }, abstract = {PURPOSE: The present study examined whether participants respond to unperturbed parameters while experiencing specific perturbations in auditory feedback. For instance, we aim to determine if speakers adjust voice loudness when only pitch is artificially altered in auditory feedback. This phenomenon is referred to as the "accompanying effect" in the present study.

METHOD: Thirty native Mandarin speakers were asked to sustain the vowel /ɛ/ for 3 s while their auditory feedback underwent single shifts in one of three distinct ways: pitch shift (±100 cents; coded as PT), loudness shift (±6 dB; coded as LD), or first formant (F1) shift (±100 Hz; coded as FM). Participants were instructed to ignore the perturbations in their auditory feedback. Response types were categorized based on pitch, loudness, and F1 for each individual trial, with, for example, Popp_Lopp_Fopp indicating opposing responses in all three domains.
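A minimal sketch of how a single trial might be coded as opposing or following in one domain is given below; the input format and the zero threshold are assumptions for illustration, not the authors' categorization code.

```python
# Minimal sketch: coding one domain's response relative to the applied perturbation.
# A response "opposes" the shift when it moves opposite in sign to the perturbation.
def code_response(shift_sign: int, response_change: float, eps: float = 0.0) -> str:
    """shift_sign: +1 or -1 for the perturbation direction; response_change:
    produced change in the same unit (cents, dB, or Hz) relative to baseline."""
    if response_change * shift_sign < -eps:
        return "opposing"
    if response_change * shift_sign > eps:
        return "following"
    return "none"

# e.g., a +100-cent pitch shift answered by an 18-cent downward production change:
print(code_response(+1, -18.0))   # -> "opposing"
```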

RESULTS: The accompanying effect appeared 93% of the time. Bayesian Poisson regression models indicate that opposing responses in all three domains (Popp_Lopp_Fopp) were the most prevalent response type across the conditions (PT, LD, and FM). The more frequently used response types exhibited opposing responses and significantly larger response curves than the less frequently used response types. Following responses became more prevalent only when the perturbed stimuli were perceived as voices from someone else (external references), particularly in the FM condition. In terms of isotropy, loudness and F1 tended to change in the same direction more often than loudness and pitch did.

CONCLUSION: The presence of the accompanying effect suggests that the motor systems responsible for regulating pitch, loudness, and formants are not entirely independent but rather interconnected to some degree.}, } @article {pmid38741274, year = {2024}, author = {Ekström, AG}, title = {Correcting the record: Phonetic potential of primate vocal tracts and the legacy of Philip Lieberman (1934-2022).}, journal = {American journal of primatology}, volume = {86}, number = {8}, pages = {e23637}, doi = {10.1002/ajp.23637}, pmid = {38741274}, issn = {1098-2345}, mesh = {Animals ; *Vocalization, Animal ; *Phonetics ; *Primates/physiology/anatomy & histology ; Humans ; History, 20th Century ; Speech/physiology ; Biological Evolution ; }, abstract = {The phonetic potential of nonhuman primate vocal tracts has been the subject of considerable contention in recent literature. Here, the work of Philip Lieberman (1934-2022) is considered at length, and two research papers-both purported challenges to Lieberman's theoretical work-and a review of Lieberman's scientific legacy are critically examined. I argue that various aspects of Lieberman's research have been consistently misinterpreted in the literature. A paper by Fitch et al. overestimates the would-be "speech-ready" capacities of a rhesus macaque, and the data presented nonetheless supports Lieberman's principal position-that nonhuman primates cannot articulate the full extent of human speech sounds. The suggestion that no vocal anatomical evolution was necessary for the evolution of human speech (as spoken by all normally developing humans) is not supported by phonetic or anatomical data. The second challenge, by Boë et al., attributes vowel-like qualities of baboon calls to articulatory capacities based on audio data; I argue that such "protovocalic" properties likely result from disparate articulatory maneuvers compared to human speakers. A review of Lieberman's scientific legacy by Boë et al. ascribes a view of speech evolution (which the authors term "laryngeal descent theory") to Lieberman, which contradicts his writings. The present article documents a pattern of incorrect interpretations of Lieberman's theoretical work in recent literature. Finally, the apparent trend of vowel-like formant dispersions in great ape vocalization literature is discussed with regard to Lieberman's theoretical work. The review concludes that the "Lieberman account" of primate vocal tract phonetic capacities remains supported by research: the ready articulation of fully human speech reflects species-unique anatomy.}, } @article {pmid38738242, year = {2024}, author = {Cao, S and Rosenzweig, I and Bilotta, F and Jiang, H and Xia, M}, title = {Automatic detection of obstructive sleep apnea based on speech or snoring sounds: a narrative review.}, journal = {Journal of thoracic disease}, volume = {16}, number = {4}, pages = {2654-2667}, pmid = {38738242}, issn = {2072-1439}, abstract = {BACKGROUND AND OBJECTIVE: Obstructive sleep apnea (OSA) is a common chronic disorder characterized by repeated breathing pauses during sleep caused by upper airway narrowing or collapse. The gold standard for OSA diagnosis is the polysomnography test, which is time consuming, expensive, and invasive. In recent years, more cost-effective approaches to OSA detection, based on the predictive value of speech and snoring sounds, have emerged.
In this paper, we offer a comprehensive summary of current research progress on the applications of speech or snoring sounds for the automatic detection of OSA and discuss the key challenges that need to be overcome for future research into this novel approach.

METHODS: PubMed, IEEE Xplore, and Web of Science databases were searched with related keywords. Literature published between 1989 and 2022 examining the potential of using speech or snoring sounds for automated OSA detection was reviewed.

KEY CONTENT AND FINDINGS: Speech and snoring sounds contain a large amount of information about OSA, and they have been extensively studied in the automatic screening of OSA. By importing features extracted from speech and snoring sounds into artificial intelligence models, clinicians can automatically screen for OSA. Features such as formants, linear prediction cepstral coefficients, and mel-frequency cepstral coefficients, together with artificial intelligence algorithms including support vector machines, Gaussian mixture models, and hidden Markov models, have been extensively studied for the detection of OSA.
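As a toy illustration of one feature/classifier pairing named above (MFCCs with a support vector machine), consider the sketch below; the file names and labels are placeholders, and real systems use far richer feature sets and larger corpora.

```python
# Minimal sketch: MFCC summary features feeding an SVM for OSA screening.
import numpy as np
import librosa
from sklearn.svm import SVC

def mfcc_features(path):
    y, sr = librosa.load(path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    # summarize each coefficient's trajectory by mean and SD -> 26-dim vector
    return np.concatenate([mfcc.mean(axis=1), mfcc.std(axis=1)])

# hypothetical snoring/speech recordings labeled OSA (1) / non-OSA (0)
paths, labels = ["s1.wav", "s2.wav"], [1, 0]
X = np.array([mfcc_features(p) for p in paths])
clf = SVC(kernel="rbf").fit(X, labels)
```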

CONCLUSIONS: Due to the significant advantages of noninvasive, low-cost, and contactless data collection, an automatic approach based on speech or snoring sounds seems to be a promising tool for the detection of OSA.}, } @article {pmid38717213, year = {2024}, author = {Feng, H and Wang, L}, title = {Acoustic analysis of English tense and lax vowels: Comparing the production between Mandarin Chinese learners and native English speakers.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {5}, pages = {3071-3089}, doi = {10.1121/10.0025931}, pmid = {38717213}, issn = {1520-8524}, mesh = {Humans ; *Speech Acoustics ; *Phonetics ; Male ; Female ; *Multilingualism ; Young Adult ; Speech Production Measurement ; Adult ; Language ; Acoustics ; Learning ; Voice Quality ; Sound Spectrography ; East Asian People ; }, abstract = {This study investigated how 40 Chinese learners of English as a foreign language (EFL learners) differed from 40 native English speakers in the production of four English tense-lax contrasts, /i-ɪ/, /u-ʊ/, /ɑ-ʌ/, and /æ-ε/, by examining the acoustic measurements of duration, the first three formant frequencies, and the slope of the first formant movement (F1 slope). The dynamic formant trajectory was modeled using discrete cosine transform coefficients to demonstrate the time-varying properties of formant trajectories. A discriminant analysis was employed to illustrate the extent to which Chinese EFL learners relied on different acoustic parameters. This study found that: (1) Chinese EFL learners overemphasized durational differences and weakened spectral differences for the /i-ɪ/, /u-ʊ/, and /ɑ-ʌ/ pairs, although they maintained sufficient spectral differences for /æ-ε/. In contrast, native English speakers predominantly used spectral differences across all four pairs; (2) in non-low tense-lax contrasts, unlike native English speakers, Chinese EFL learners failed to exhibit different F1 slope values, indicating a non-nativelike tongue-root placement during the articulatory process. The findings underscore the contribution of dynamic spectral patterns to the differentiation between English tense and lax vowels, and reveal the influence of precise articulatory gestures on the realization of the tense-lax contrast.}, } @article {pmid38714709, year = {2024}, author = {Ostrega, J and Shiramizu, V and Lee, AJ and Jones, BC and Feinberg, DR}, title = {No evidence that averaging voices influences attractiveness.}, journal = {Scientific reports}, volume = {14}, number = {1}, pages = {10488}, pmid = {38714709}, issn = {2045-2322}, support = {EP/T023783/1//Engineering and Physical Sciences Research Council/ ; RGPIN-2023-05146//Natural Sciences and Engineering Research Council of Canada/ ; }, mesh = {Humans ; Male ; Female ; *Voice/physiology ; Adult ; Young Adult ; *Beauty ; Judgment/physiology ; Adolescent ; }, abstract = {Vocal attractiveness influences important social outcomes. While most research on the acoustic parameters that influence vocal attractiveness has focused on the possible roles of sexually dimorphic characteristics of voices, such as fundamental frequency (i.e., pitch) and formant frequencies (i.e., a correlate of body size), other work has reported that increasing vocal averageness increases attractiveness. Here we investigated the roles these three characteristics play in judgments of the attractiveness of male and female voices. 
In Study 1, we found that increasing vocal averageness significantly decreased distinctiveness ratings, demonstrating that participants could detect manipulations of vocal averageness in this stimulus set with this testing paradigm. However, in Study 2, we found no evidence that increasing averageness significantly increased attractiveness ratings of voices. In Study 3, we found that fundamental frequency was negatively correlated with male vocal attractiveness and positively correlated with female vocal attractiveness. By contrast with these results for fundamental frequency, vocal attractiveness and formant frequencies were not significantly correlated. Collectively, our results suggest that averageness may not necessarily significantly increase attractiveness judgments of voices and are consistent with previous work reporting significant associations between attractiveness and voice pitch.}, } @article {pmid38704279, year = {2024}, author = {Leyns, C and Adriaansen, A and Daelman, J and Bostyn, L and Meerschman, I and T'Sjoen, G and D'haeseleer, E}, title = {Long-term Acoustic Effects of Gender-Affirming Voice Training in Transgender Women.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.04.007}, pmid = {38704279}, issn = {1873-4588}, abstract = {OBJECTIVES: One role of a speech-language pathologist (SLP) is to help transgender clients develop healthy, gender-congruent communication. Transgender women frequently approach SLPs to train their voices to sound more feminine; however, the long-term acoustic effects of training need to be rigorously examined in effectiveness studies. The aim of this study was to investigate the long-term effects (follow-up 1: 3 months and follow-up 2: 1 year after the last session) of gender-affirming voice training for transgender women, in terms of acoustic parameters.

STUDY DESIGN: This study was a randomized sham-controlled trial with a cross-over design.

METHODS: Twenty-six transgender women were included for follow-up 1 and 18 for follow-up 2. All participants received 14 weeks of gender-affirming voice training (4 weeks of sham training and 10 weeks of voice feminization training: 5 weeks of pitch elevation training and 5 weeks of articulation-resonance training), but in a different order. Speech samples were recorded with Praat at four different time points (pre, post, follow-up 1, follow-up 2). Acoustic analysis included fo of the sustained vowel /a:/, reading, and spontaneous speech. Formant frequencies (F1-F2-F3) of the vowels /a/, /i/, and /u/ were determined and the vowel space was calculated. A linear mixed model was used to compare the acoustic voice measurements between time points (pre - post, pre - follow-up 1, pre - follow-up 2, post - follow-up 1, post - follow-up 2, follow-up 1 - follow-up 2).

RESULTS: Most of the fo measurements and formant frequencies that increased immediately after the intervention were stable at both follow-up measurements. The median fo during the sustained vowel, reading, and spontaneous speech remained elevated at both follow-ups compared with the pre-measurement. However, a decrease of 16 Hz/1.7 ST (reading) and 12 Hz/1.5 ST (spontaneous speech) was detected between the post-measurement (169 Hz for reading, 144 Hz for spontaneous speech) and 1 year after the last session (153 Hz and 132 Hz, respectively). The lower limit of fo did not change during reading and spontaneous speech, either directly after the intervention or during both follow-ups. F1-F2 of the vowel /a/ and the vowel space were increased after the intervention and at both follow-ups. Individual analyses showed that more aspects should be controlled after the intervention, such as exercises performed at home or the duration of extra gender-affirming voice training sessions.
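The paired Hz and semitone values above follow from the standard interval formula, 12 * log2(f_to / f_from); a two-line check:

```python
# Minimal check of the Hz-to-semitone conversions reported above.
import math

def semitones(f_from: float, f_to: float) -> float:
    return 12 * math.log2(f_to / f_from)

print(round(semitones(169, 153), 1))   # reading: ~ -1.7 ST for the 16 Hz drop
print(round(semitones(144, 132), 1))   # spontaneous speech: ~ -1.5 ST for the 12 Hz drop
```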

CONCLUSIONS: After 10 sessions of voice feminization training and follow-up measurements after 3 months and 1 year, stable increases were found for some formant frequencies and fo measurements, but not all of them. More time should be spent on increasing the fifth percentile of fo, as the lower limit of fo also contributes to the perception of a more feminine voice.}, } @article {pmid38693788, year = {2024}, author = {Kocjančič, T and Bořil, T and Hofmann, S}, title = {Acoustic and Articulatory Visual Feedback in Classroom L2 Vowel Remediation.}, journal = {Language and speech}, volume = {}, number = {}, pages = {238309231223736}, doi = {10.1177/00238309231223736}, pmid = {38693788}, issn = {1756-6053}, abstract = {This paper presents L2 vowel remediation in a classroom setting via two real-time visual feedback methods: articulatory ultrasound tongue imaging, which shows tongue shape and position, and a newly developed acoustic formant analyzer, which visualizes a point correlating with the combined effect of tongue position and lip rounding in a vowel quadrilateral. Ten Czech students of the Swedish language participated in the study. Swedish vowel production is difficult for Czech speakers since the languages differ significantly in their vowel systems. The students selected the vowel targets on their own and practiced in two classroom groups, with six students receiving two ultrasound training lessons, followed by one acoustic, and four students receiving two acoustic lessons, followed by one ultrasound. Audio data were collected pre-training, after the two sessions employing the first visual feedback method, and at post-training, allowing measurement of the Euclidean distance among selected groups of vowels and observation of the direction of change within the vowel quadrilateral as a result of practice. Perception tests were performed before and after training, revealing that most learners perceived selected vowels correctly already before the practice. The study showed that both feedback methods can be successfully applied to L2 classroom learning, and both lead to improvement in the pronunciation of the selected vowels, as well as the Swedish vowel set as a whole. However, ultrasound tongue imaging seems to have an advantage, as it resulted in a greater number of improved targets.}, } @article {pmid38656176, year = {2024}, author = {Saldías O'Hrens, M and Castro, C and Espinoza, VM and Stoney, J and Quezada, C and Laukkanen, AM}, title = {Spectral features related to the auditory perception of twang-like voices.}, journal = {Logopedics, phoniatrics, vocology}, volume = {}, number = {}, pages = {1-18}, doi = {10.1080/14015439.2024.2345373}, pmid = {38656176}, issn = {1651-2022}, abstract = {BACKGROUND: To the best of our knowledge, studies on the relationship between spectral energy distribution and the degree of perceived twang-like voices are still sparse. Through an auditory-perceptual test, we aimed to explore the spectral features that may relate to the auditory perception of twang-like voices.

METHODS: Ten judges who were blind to the test's tasks and stimuli rated the amount of twang perceived on seventy-six audio samples. The stimuli consisted of twenty voices recorded from eight CCM singers who sustained the vowel [a:] in different pitches, with and without a twang-like voice. Also, forty filtered and sixteen synthesized-manipulated stimuli were included.

RESULTS AND CONCLUSIONS: Based on the intra-rater reliability scores, four judges were identified as suitable to be included in the analyses. Results showed that the frequencies of F1 and F2 correlated strongly with the auditory perception of twang-like voices (0.90 and 0.74, respectively), whereas F3 showed a moderate negative correlation (-0.52). The frequency difference between F1 and F3 showed a strong negative correlation (-0.82). The mean energy between 1-2 kHz and 2-3 kHz correlated moderately (0.51 and 0.42, respectively). The frequencies of F4 and F5 and the energy above 3 kHz showed weak correlations. Since spectral changes under 2 kHz have been associated with jaw, lip, and tongue adjustments (i.e., vowel articulation), and a higher vertical laryngeal position might affect the frequency of all formants (including F1 and F2), our results suggest that vowel articulation and laryngeal height may be relevant when performing twang-like voices.}, } @article {pmid38644071, year = {2024}, author = {Cruz, TLB and Frič, M and Andrade, PA}, title = {A Comparison of Countertenor Singing at Various Professional Levels Using Acoustic, Electroglottographic, and Videofluoroscopic Methods.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.03.033}, pmid = {38644071}, issn = {1873-4588}, abstract = {INTRODUCTION: The vocal characteristics of countertenors (CTTs) are poorly understood due to a lack of studies in this field. This study aims to explore differences among CTTs at various professional levels, examining both disparities and congruences in singing styles to better understand the CTT voice.

MATERIALS AND METHODS: Four CTTs (one student, one amateur, and two professionals) sang "La giustizia ha già sull'arco" from Handel's Giulio Cesare, with concurrent videofluoroscopic, electroglottographic (EGG), and acoustic data collection. Auditory-perceptual analysis was employed to rate professional level. Acoustic analysis included LH1-LH2, formant cluster prominence, and vibrato analysis. EGG data were analyzed using FonaDyn software, while anatomical modifications were quantified using videofluoroscopic images.

RESULTS: CTTs exhibited EGG contact quotient values surpassing typical levels for inexperienced falsettos. Their vibrato characteristics aligned with expectations for classical singing, whereas the singer's formant was not observed. Variations in supraglottic adjustments among CTTs underscored the diversity of techniques employed by CTT singers.

CONCLUSIONS: CTTs exhibited vocal techniques that highlighted the influence of individual preferences, professional experience, and stylistic choices in shaping their singing characteristics. The data revealed discernible differences between professional and amateur CTTs, providing insights into the impact of varying levels of experience on vocal expression.}, } @article {pmid38629882, year = {2024}, author = {Torres, C and Li, W and Escudero, P}, title = {Acoustic, phonetic, and phonological features of Drehu vowels.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {4}, pages = {2612-2626}, doi = {10.1121/10.0025538}, pmid = {38629882}, issn = {1520-8524}, mesh = {*Phonetics ; *Speech Acoustics ; Acoustics ; }, abstract = {This study presents an acoustic investigation of the vowel inventory of Drehu (Southern Oceanic Linkage), spoken in New Caledonia. Reportedly, Drehu has a 14 vowel system distinguishing seven vowel qualities and an additional length distinction. Previous phonological descriptions were based on impressionistic accounts showing divergent proposals for two out of seven reported vowel qualities. This study presents the first phonetic investigation of Drehu vowels based on acoustic data from eight speakers. To examine the phonetic correlates of the proposed phonological vowel inventory, multi-point acoustic analyses were used, and vowel inherent spectral change (VISC) was investigated (F1, F2, and F3). Additionally, vowel duration was measured. Contrary to reports from other studies on VISC in monophthongs, we find that monophthongs in Drehu are mostly steady state. We propose a revised vowel inventory and focus on the acoustic description of open-mid /ɛ/ and the central vowel /ə/, whose status was previously unclear. Additionally, we find that vowel quality stands orthogonal to vowel quantity by demonstrating that the phonological vowel length distinction is primarily based on a duration cue rather than formant structure. Finally, we report the acoustic properties of the seven vowel qualities that were identified.}, } @article {pmid38564597, year = {2024}, author = {Wang, H and Ali, Y and Max, L}, title = {Perceptual formant discrimination during speech movement planning.}, journal = {PloS one}, volume = {19}, number = {4}, pages = {e0301514}, pmid = {38564597}, issn = {1932-6203}, support = {R01 DC017444/DC/NIDCD NIH HHS/United States ; R01 DC020162/DC/NIDCD NIH HHS/United States ; T32 DC005361/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Speech/physiology ; *Speech Perception/physiology ; Acoustics ; *Auditory Cortex ; Movement ; Phonetics ; Speech Acoustics ; }, abstract = {Evoked potential studies have shown that speech planning modulates auditory cortical responses. The phenomenon's functional relevance is unknown. We tested whether, during this time window of cortical auditory modulation, there is an effect on speakers' perceptual sensitivity for vowel formant discrimination. Participants made same/different judgments for pairs of stimuli consisting of a pre-recorded, self-produced vowel and a formant-shifted version of the same production. Stimuli were presented prior to a "go" signal for speaking, prior to passive listening, and during silent reading. The formant discrimination stimulus /uh/ was tested with a congruent productions list (words with /uh/) and an incongruent productions list (words without /uh/). 
Logistic curves were fitted to participants' responses, and the just-noticeable difference (JND) served as a measure of discrimination sensitivity. We found a statistically significant effect of condition (worst discrimination before speaking) without congruency effect. Post-hoc pairwise comparisons revealed that JND was significantly greater before speaking than during silent reading. Thus, formant discrimination sensitivity was reduced during speech planning regardless of the congruence between discrimination stimulus and predicted acoustic consequences of the planned speech movements. This finding may inform ongoing efforts to determine the functional relevance of the previously reported modulation of auditory processing during speech planning.}, } @article {pmid38557735, year = {2024}, author = {Havenhill, J}, title = {Articulatory and acoustic dynamics of fronted back vowels in American English.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {4}, pages = {2285-2301}, doi = {10.1121/10.0025461}, pmid = {38557735}, issn = {1520-8524}, mesh = {United States ; *Speech Acoustics ; *Phonetics ; Acoustics ; Language ; South Carolina ; }, abstract = {Fronting of the vowels /u, ʊ, o/ is observed throughout most North American English varieties, but has been analyzed mainly in terms of acoustics rather than articulation. Because an increase in F2, the acoustic correlate of vowel fronting, can be the result of any gesture that shortens the front cavity of the vocal tract, acoustic data alone do not reveal the combination of tongue fronting and/or lip unrounding that speakers use to produce fronted vowels. It is furthermore unresolved to what extent the articulation of fronted back vowels varies according to consonantal context and how the tongue and lips contribute to the F2 trajectory throughout the vowel. This paper presents articulatory and acoustic data on fronted back vowels from two varieties of American English: coastal Southern California and South Carolina. Through analysis of dynamic acoustic, ultrasound, and lip video data, it is shown that speakers of both varieties produce fronted /u, ʊ, o/ with rounded lips, and that high F2 observed for these vowels is associated with a front-central tongue position rather than unrounded lips. Examination of time-varying formant trajectories and articulatory configurations shows that the degree of vowel-internal F2 change is predominantly determined by coarticulatory influence of the coda.}, } @article {pmid38530014, year = {2024}, author = {Singh, VP and Sahidullah, M and Kinnunen, T}, title = {ChildAugment: Data augmentation methods for zero-resource children's speaker verification.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {3}, pages = {2221-2232}, doi = {10.1121/10.0025178}, pmid = {38530014}, issn = {1520-8524}, mesh = {Male ; Child ; Adult ; Female ; Humans ; Reproducibility of Results ; *Speech ; *Acoustics ; Neural Networks, Computer ; Motivation ; }, abstract = {The accuracy of modern automatic speaker verification (ASV) systems, when trained exclusively on adult data, drops substantially when applied to children's speech. The scarcity of children's speech corpora hinders fine-tuning ASV systems for children's speech. Hence, there is a timely need to explore more effective ways of reusing adults' speech data. 
One promising approach is to align vocal-tract parameters between adults and children through children-specific data augmentation, referred to here as ChildAugment. Specifically, we modify the formant frequencies and formant bandwidths of adult speech to emulate children's speech. The modified spectra are used to train an emphasized channel attention, propagation, and aggregation in time-delay neural network (ECAPA-TDNN) recognizer for children. We compare ChildAugment against various state-of-the-art data augmentation techniques for children's ASV. We also extensively compare different scoring methods, including cosine scoring, probabilistic linear discriminant analysis (PLDA), and neural PLDA. We also propose a low-complexity weighted cosine score for extremely low-resource children's ASV. Our findings on the CSLU kids corpus indicate that ChildAugment holds promise as a simple, acoustics-motivated approach for improving state-of-the-art deep-learning-based ASV for children. We achieve up to 12.45% (boys) and 11.96% (girls) relative improvement over the baseline. For reproducibility, we provide the evaluation protocols and codes here.}, } @article {pmid38503674, year = {2024}, author = {Södersten, M and Oates, J and Sand, A and Granqvist, S and Quinn, S and Dacakis, G and Nygren, U}, title = {Gender-Affirming Voice Training for Trans Women: Acoustic Outcomes and Their Associations With Listener Perceptions Related to Gender.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.02.003}, pmid = {38503674}, issn = {1873-4588}, abstract = {OBJECTIVES: To investigate acoustic outcomes of gender-affirming voice training for trans women wanting to develop a female sounding voice and to describe what happens acoustically when male sounding voices become more female sounding.
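The ChildAugment method described in the Singh et al. entry above modifies formant frequencies and bandwidths in the spectral domain; as a loose, generic stand-in (not the authors' method), Praat's "Change gender" resynthesis can raise adult formants toward child-like values, as sketched below with an assumed shift ratio and file name.

```python
# Minimal sketch: raising adult formant frequencies via Praat's "Change gender"
# resynthesis through parselmouth. Shift ratio and file names are assumptions.
import parselmouth
from parselmouth.praat import call

adult = parselmouth.Sound("adult.wav")
childlike = call(adult, "Change gender",
                 75, 600,   # pitch analysis floor/ceiling (Hz)
                 1.25,      # formant shift ratio > 1 raises formants (assumed value)
                 0.0,       # new pitch median: 0 = keep the original median
                 1.0, 1.0)  # pitch range factor, duration factor
childlike.save("adult_childlike.wav", "WAV")
```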

STUDY DESIGN: Prospective treatment study with repeated measures.

METHODS: N = 74 trans women completed a voice training program of 8-12 sessions and had their voices audio recorded twice before and twice after training. Reference data were obtained from N = 40 cisgender speakers. Fundamental frequency (fo), formant frequencies (F1-F4), sound pressure level (Leq), and level difference between first and second harmonic (L1-L2) were extracted from a reading passage and spontaneous speech. N = 79 naive listeners provided gender-related ratings of participants' audio recordings. A linear mixed-effects model was used to estimate average training effects. Individual level analyses determined how changes in acoustic data were related to listeners' ratings.
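For readers who want to reproduce measures of this kind, here is a hedged sketch using the praat-parselmouth package (an assumption; the study does not state its software) to extract mean fo and median F1-F4 from a recording:

```python
import numpy as np
import parselmouth  # Python interface to Praat

snd = parselmouth.Sound("reading_passage.wav")     # hypothetical file

pitch = snd.to_pitch()
f0 = pitch.selected_array["frequency"]
f0_mean = f0[f0 > 0].mean()                        # skip unvoiced frames

formants = snd.to_formant_burg(max_number_of_formants=5)
times = np.arange(0.0, snd.duration, 0.01)         # 10-ms analysis grid
medians = {n: np.nanmedian([formants.get_value_at_time(n, t) for t in times])
           for n in (1, 2, 3, 4)}
print(f"mean fo = {f0_mean:.0f} Hz",
      {f"F{n}": round(v) for n, v in medians.items()})
```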

RESULTS: Group data showed substantial training effects on fo (average, minimum, and maximum) and formant frequencies. Individual data demonstrated that many participants also increased Leq and some increased L1-L2. Measures that most strongly predicted listener ratings of a female sounding voice were: fo, average formant frequency, and Leq.

CONCLUSIONS: This is the largest prospective study reporting on acoustic outcomes of gender-affirming voice training for trans women. We confirm findings from previous smaller-scale studies by demonstrating that listener perceptions of male and female sounding voices are related to acoustic voice features, and that voice training for trans women wanting to sound female is associated with desirable acoustic changes, indicating training effectiveness. Although acoustic measures can be a valuable indicator of training effectiveness, particularly from the perspective of clinicians and researchers, we contend that a combination of outcome measures, including client perspectives, is needed to provide a comprehensive evaluation of gender-affirming voice training that is relevant for all stakeholders.}, } @article {pmid38501906, year = {2024}, author = {Dolquist, DV and Munson, B}, title = {Clinical Focus: The Development and Description of a Palette of Transmasculine Voices.}, journal = {American journal of speech-language pathology}, volume = {33}, number = {3}, pages = {1113-1126}, doi = {10.1044/2024_AJSLP-23-00398}, pmid = {38501906}, issn = {1558-9110}, mesh = {Humans ; Male ; *Transgender Persons/psychology ; *Speech Acoustics ; *Voice Quality ; Adult ; *Speech Production Measurement ; Young Adult ; Speech-Language Pathology/methods ; Female ; Middle Aged ; Phonetics ; }, abstract = {PURPOSE: The study of gender and speech has historically excluded studies of transmasculine individuals. Consequently, generalizations about speech and gender are based on cisgender individuals. This lack of representation hinders clinical training and clinical service delivery, particularly by speech-language pathologists providing gender-affirming communication services. This letter describes a new corpus of the speech of American English-speaking transmasculine men, transmasculine nonbinary people, and cisgender men that is open and available to clinicians and researchers.

METHOD: Twenty masculine-presenting native English speakers from the Upper Midwestern United States (including cisgender men, transmasculine men, and transmasculine nonbinary people) were recorded, producing three sets of speech materials: Consensus Auditory-Perceptual Evaluation of Voice sentences, the Rainbow Passage, and a novel set of sentences developed for this project. Acoustic measures were made of vowels (overall formant frequency scaling, vowel-space dispersion, fundamental frequency, breathiness), of consonants (voice onset time of word-initial voiceless stops, spectral moments of word-initial /s/), and of the entire sentence (rate of speech).
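One plausible operationalization of the vowel-space dispersion named above (an assumption; the letter does not give its formula) is the mean Euclidean distance of vowel tokens from their F1-F2 centroid:

```python
import numpy as np

def vowel_space_dispersion(f1, f2):
    """Mean distance (Hz) of F1-F2 points from their centroid."""
    pts = np.column_stack([f1, f2])
    return np.linalg.norm(pts - pts.mean(axis=0), axis=1).mean()

# hypothetical corner-vowel tokens
f1 = np.array([300, 320, 750, 730, 350, 680])
f2 = np.array([2300, 2250, 1300, 1350, 900, 1100])
print(f"dispersion = {vowel_space_dispersion(f1, f2):.1f} Hz")
```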

RESULTS: The acoustic measures reveal a wide range for all dependent measures and low correlations among the measures. Results show that many of the voices depart considerably from the norms for men's speech in published studies.

CONCLUSION: This new corpus can be used to illustrate different ways of sounding masculine by speech-language pathologists performing gender-affirming communication services and by higher education teachers as examples of diverse ways of sounding masculine.}, } @article {pmid38498664, year = {2024}, author = {Kim, Y and Thompson, A and Nip, ISB}, title = {Effects of Deep-Brain Stimulation on Speech: Perceptual and Acoustic Data.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {4}, pages = {1090-1106}, pmid = {38498664}, issn = {1558-9102}, support = {F31 DC020121/DC/NIDCD NIH HHS/United States ; R01 DC020468/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Deep Brain Stimulation ; Acoustics ; Speech Intelligibility/physiology ; Voice Quality ; *Parkinson Disease/complications/therapy ; Brain ; Speech Acoustics ; }, abstract = {PURPOSE: This study examined speech changes induced by deep-brain stimulation (DBS) in speakers with Parkinson's disease (PD) using a set of auditory-perceptual and acoustic measures.

METHOD: Speech recordings from nine speakers with PD and DBS were compared between DBS-On and DBS-Off conditions using auditory-perceptual and acoustic analyses. Auditory-perceptual ratings included voice quality, articulation precision, prosody, speech intelligibility, and listening effort obtained from 44 listeners. Acoustic measures were made for voicing proportion, second formant frequency slope, vowel dispersion, articulation rate, and range of fundamental frequency and intensity.
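The second formant frequency slope named above is commonly estimated as the slope of a straight-line fit to the F2 track over a vowel or utterance; a hedged sketch with a synthetic track (values and sampling step are hypothetical):

```python
import numpy as np

t = np.arange(0.0, 0.20, 0.01)                           # 10-ms steps, 200-ms vowel
f2 = 1200 + 1500 * t + np.random.normal(0, 20, t.size)   # synthetic rising F2 track

slope = np.polyfit(t, f2, 1)[0]                          # first-order fit
print(f"F2 slope = {slope:.0f} Hz/s")
```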

RESULTS: No significant changes were found between DBS-On and DBS-Off for the five perceptual ratings. Four of six acoustic measures revealed significant differences between the two conditions. While articulation rate and acoustic vowel dispersion increased, voicing proportion and intensity range decreased from the DBS-Off to DBS-On condition. However, a visual examination of the data indicated that the statistical significance was mostly driven by a small number of participants, while the majority did not show a consistent pattern of such changes.

CONCLUSIONS: Our data, in general, indicate that no-to-minimal changes in speech production ensued from DBS. The findings are discussed with a focus on the large interspeaker variability in speech characteristics among individuals with PD and the potential effects of DBS on speech.}, } @article {pmid38498508, year = {2024}, author = {Sabev, M and Andreeva, B}, title = {The acoustics of Contemporary Standard Bulgarian vowels: A corpus study.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {3}, pages = {2128-2138}, doi = {10.1121/10.0025293}, pmid = {38498508}, issn = {1520-8524}, mesh = {*Speech Acoustics ; Bulgaria ; *Phonetics ; Acoustics ; Multivariate Analysis ; }, abstract = {A comprehensive examination of the acoustics of Contemporary Standard Bulgarian vowels is lacking to date, and this article aims to fill that gap. Six acoustic variables-the first three formant frequencies, duration, mean f0, and mean intensity-of 11 615 vowel tokens from 140 speakers were analysed using linear mixed models, multivariate analysis of variance, and linear discriminant analysis. The vowel system, which comprises six phonemes in stressed position, [ε a ɔ i ɤ u], was examined from four angles. First, vowels in pretonic syllables were compared to other unstressed vowels, and no spectral or durational differences were found, contrary to an oft-repeated claim that pretonic vowels reduce less. Second, comparisons of stressed and unstressed vowels revealed significant differences in all six variables for the non-high vowels [ε a ɔ]. No spectral or durational differences were found in [i ɤ u], which disproves another received view that high vowels are lowered when unstressed. Third, non-high vowels were compared with their high counterparts; the height contrast was completely neutralized in unstressed [a-ɤ] and [ɔ-u] while [ε-i] remained distinct. Last, the acoustic correlates of vowel contrasts were examined, and it was demonstrated that only F1, F2 frequencies and duration were systematically employed in differentiating vowel phonemes.}, } @article {pmid38497731, year = {2024}, author = {Ashokumar, M and Schwartz, JL and Ito, T}, title = {Changes in Speech Production Following Perceptual Training With Orofacial Somatosensory Inputs.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {10S}, pages = {3962-3973}, doi = {10.1044/2023_JSLHR-23-00249}, pmid = {38497731}, issn = {1558-9102}, mesh = {Humans ; *Speech Perception/physiology ; Female ; Male ; Young Adult ; *Speech/physiology ; Adult ; *Phonetics ; Face/physiology ; Learning/physiology ; }, abstract = {PURPOSE: Orofacial somatosensory inputs play an important role in speech motor control and speech learning. Since receiving specific auditory-somatosensory inputs during speech perceptual training alters speech perception, similar perceptual training could also alter speech production. We examined whether production performance was changed by perceptual training with orofacial somatosensory inputs.
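Relating to the Bulgarian corpus study above (Sabev and Andreeva), the linear discriminant analysis step can be sketched over the three cues the authors found systematically employed (F1, F2, and duration). The data below are fabricated placeholders purely to show the mechanics:

```python
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
# columns: F1 (Hz), F2 (Hz), duration (ms); two toy vowel classes
X_i = rng.normal([300, 2200, 90], [40, 150, 15], size=(50, 3))
X_a = rng.normal([700, 1300, 110], [60, 120, 20], size=(50, 3))
X, y = np.vstack([X_i, X_a]), np.array(["i"] * 50 + ["a"] * 50)

lda = LinearDiscriminantAnalysis()
print("cross-validated accuracy:", cross_val_score(lda, X, y, cv=5).mean())
```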

METHOD: We focused on the French vowels /e/ and /ø/, contrasted in their articulation by horizontal gestures. Perceptual training consisted of a vowel identification task contrasting /e/ and /ø/. Along with training, for the first group of participants, somatosensory stimulation was applied as facial skin stretch in the backward direction. We recorded the target vowels uttered by the participants before and after the perceptual training and compared their F1, F2, and F3 formants. We also tested a control group with no somatosensory stimulation and another somatosensory group with a different vowel continuum (/e/-/i/) for perceptual training.
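For a pre/post design like the one above, the formant comparison can be sketched as a paired test on per-participant means; the values are hypothetical and do not reproduce the study's statistics:

```python
import numpy as np
from scipy.stats import ttest_rel

f2_pre = np.array([1580, 1610, 1555, 1600, 1590, 1620])   # Hz, per participant
f2_post = np.array([1540, 1575, 1530, 1560, 1565, 1580])  # consistent F2 decrease

t, p = ttest_rel(f2_post, f2_pre)
print(f"mean change = {np.mean(f2_post - f2_pre):.1f} Hz, t = {t:.2f}, p = {p:.3f}")
```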

RESULTS: Perceptual training with somatosensory stimulation induced changes in F2 and F3 in the produced vowel sounds. F2 decreased consistently in the two somatosensory groups. F3 increased following the /e/-/ø/ training and decreased following the /e/-/i/ training. F2 change was significantly correlated with the perceptual shift between the first and second half of the training phase in the somatosensory group with the /e/-/ø/ training, but not with the /e/-/i/ training. The control group showed no effect on F2 and F3, and only a tendency toward an F1 increase.

CONCLUSION: The results suggest that somatosensory inputs associated with speech sound inputs can play a role in speech training and learning in both production and perception.}, } @article {pmid38480766, year = {2024}, author = {Saha, S and Rattansingh, A and Martino, R and Viswanathan, K and Saha, A and Montazeri Ghahjaverestan, N and Yadollahi, A}, title = {A pilot observation using ultrasonography and vowel articulation to investigate the influence of suspected obstructive sleep apnea on upper airway.}, journal = {Scientific reports}, volume = {14}, number = {1}, pages = {6144}, pmid = {38480766}, issn = {2045-2322}, mesh = {Humans ; Pilot Projects ; *Sleep Apnea, Obstructive/complications ; *Sleep Apnea Syndromes/complications ; Trachea ; Ultrasonography ; }, abstract = {Failure to employ suitable measures before administering full anesthesia to patients with obstructive sleep apnea (OSA) who are undergoing surgery may lead to complications after surgery. Therefore, it is very important to screen for OSA before surgery, which is currently done with subjective questionnaires such as the STOP-Bang and Berlin scores. These questionnaires have 10-36% specificity in detecting sleep apnea and give no information on the anatomy of the upper airway, which is important for intubation. To address these challenges, we performed a pilot study to understand the utility of ultrasonography and vowel articulation in screening OSA. Our objective was to investigate the influence of OSA risk factors on vowel articulation through ultrasonography and acoustic feature analysis. To accomplish this, we recruited 18 individuals with no risk of OSA and 13 individuals with high risk of OSA and asked them to utter vowels, such as /a/ (as in "Sah") and /i/ (as in "See"). An expert ultrasonographer measured the parasagittal anterior-posterior (PAP) and transverse diameter of the upper airway. From the recorded vowel sounds, we extracted 106 features, including power, pitch, formant, and Mel frequency cepstral coefficients (MFCC). We analyzed the variation of the PAP diameters and vowel features from "See" (/i/) to "Sah" (/a/) between control and OSA groups by two-way repeated measures ANOVA. We found that the variation of upper airway diameter from "See" to "Sah" was significantly smaller in the OSA group than in the control group (OSA: ∆12.8 ± 5.3 mm vs. control: ∆22.5 ± 3.9 mm, p < 0.01). Moreover, we found that several vowel features showed the exact same or opposite trend as the PAP diameter variation, which led us to build a machine learning model to estimate PAP diameter from vowel features. We found a correlation coefficient of 0.75 between the estimated and measured PAP diameter after applying four estimation models and combining their output with a random forest model, which showed the feasibility of using acoustic features of vowel sounds to monitor upper airway diameter. Overall, this study has proven the concept that ultrasonography and vowel sound analysis may be useful as an easily accessible imaging tool of the upper airway.}, } @article {pmid38469160, year = {2024}, author = {Lee, H and Cho, M and Kwon, HY}, title = {Attention-based speech feature transfer between speakers.}, journal = {Frontiers in artificial intelligence}, volume = {7}, number = {}, pages = {1259641}, pmid = {38469160}, issn = {2624-8212}, abstract = {In this study, we propose a simple yet effective method for incorporating the source speaker's characteristics in the target speaker's speech.
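The acoustic half of the OSA pilot above (Saha et al.) pairs spectral features with a regression model; here is a hedged sketch of that kind of pipeline, where the feature set, file names, and targets are stand-ins rather than the study's 106 features:

```python
import numpy as np
import librosa
from sklearn.ensemble import RandomForestRegressor

def mfcc_features(path, n_mfcc=13):
    """One MFCC mean vector per recording."""
    y, sr = librosa.load(path, sr=None)
    return librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc).mean(axis=1)

# hypothetical: one row per participant; target = PAP diameter change (mm)
X = np.vstack([mfcc_features(p) for p in ["s01_a.wav", "s02_a.wav"]])
y = np.array([12.8, 22.5])

model = RandomForestRegressor(n_estimators=200, random_state=0).fit(X, y)
print(model.predict(X))
```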
This allows our model to generate the speech of the target speaker with the style of the source speaker. To achieve this, we focus on the attention model within the speech synthesis model, which learns various speaker features such as spectrogram, pitch, intensity, formant, pulse, and voice breaks. The model is trained separately using datasets specific to the source and target speakers. Subsequently, we replace the attention weights learned from the source speaker's dataset with the attention weights from the target speaker's model. Finally, by providing new input texts to the target model, we generate the speech of the target speaker with the styles of the source speaker. We validate the effectiveness of our model through similarity analysis utilizing five evaluation metrics and showcase real-world examples.}, } @article {pmid38456732, year = {2024}, author = {Borjigin, A and Bakst, S and Anderson, K and Litovsky, RY and Niziolek, CA}, title = {Discrimination and sensorimotor adaptation of self-produced vowels in cochlear implant users.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {3}, pages = {1895-1908}, pmid = {38456732}, issn = {1520-8524}, support = {R01 DC003083/DC/NIDCD NIH HHS/United States ; R00 DC014520/DC/NIDCD NIH HHS/United States ; T32 DC005359/DC/NIDCD NIH HHS/United States ; U54 HD090256/HD/NICHD NIH HHS/United States ; F32 DC017653/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Cochlear Implants ; *Speech Perception ; *Cochlear Implantation ; Auditory Perception ; Speech ; }, abstract = {Humans rely on auditory feedback to monitor and adjust their speech for clarity. Cochlear implants (CIs) have helped over a million people restore access to auditory feedback, which significantly improves speech production. However, there is substantial variability in outcomes. This study investigates the extent to which CI users can use their auditory feedback to detect self-produced sensory errors and make adjustments to their speech, given the coarse spectral resolution provided by their implants. First, we used an auditory discrimination task to assess the sensitivity of CI users to small differences in formant frequencies of their self-produced vowels. Then, CI users produced words with altered auditory feedback in order to assess sensorimotor adaptation to auditory error. Almost half of the CI users tested can detect small, within-channel differences in their self-produced vowels, and they can utilize this auditory feedback towards speech adaptation. An acoustic hearing control group showed better sensitivity to the shifts in vowels, even in CI-simulated speech, and elicited more robust speech adaptation behavior than the CI users. Nevertheless, this study confirms that CI users can compensate for sensory errors in their speech and supports the idea that sensitivity to these errors may relate to variability in production.}, } @article {pmid38443265, year = {2024}, author = {Stone, TC and Erickson, ML}, title = {Experienced and Inexperienced Listeners' Perception of Vocal Strain.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2024.02.002}, pmid = {38443265}, issn = {1873-4588}, abstract = {OBJECTIVE: The ability to perceive strain or tension in a voice is critical for both speech-language pathologists and singing teachers. Research on voice quality has focused primarily on the perception of breathiness or roughness. 
The perception of vocal strain has not been extensively researched and is poorly understood.

METHODS/DESIGN: This study employs a group and a within-subject design. Synthetic female sung stimuli were created that varied in source slope and vocal tract transfer function. Two groups of listeners, inexperienced listeners and experienced vocal pedagogues, listened to the stimuli and rated the perceived strain using a visual analog scale. Synthetic female stimuli were constructed on the vowel /ɑ/ at two pitches, A3 and F5, using glottal source slopes that drop in amplitude at constant rates varying from -6 dB/octave to -18 dB/octave. All stimuli were filtered using three vocal tract transfer functions, one derived from a lyric/coloratura soprano, one derived from a mezzo-soprano, and a third that has resonance frequencies mid-way between the two. Listeners heard the stimuli over headphones and rated them on a scale from "no strain" to "very strained" using a visual analog scale.
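The source-filter construction described above can be sketched as follows (hedged; the study used its own synthesis, and the formant frequencies and bandwidths here are illustrative): a harmonic source whose spectrum falls at a fixed dB/octave rate, passed through second-order formant resonators.

```python
import numpy as np
from scipy.signal import lfilter

fs, dur, f0 = 44100, 1.0, 220.0              # A3
slope_db = -12.0                             # source slope, dB/octave
t = np.arange(int(fs * dur)) / fs

n_harm = int((fs / 2) // f0)
src = sum(10 ** (slope_db * np.log2(n) / 20) * np.sin(2 * np.pi * n * f0 * t)
          for n in range(1, n_harm + 1))     # harmonic source with given slope

def resonator(x, fc, bw, fs):
    """Second-order (Klatt-style) formant resonator with unity DC gain."""
    r = np.exp(-np.pi * bw / fs)
    b1, b2 = 2 * r * np.cos(2 * np.pi * fc / fs), -r * r
    return lfilter([1 - b1 - b2], [1, -b1, -b2], x)

out = src
for fc, bw in [(800, 80), (1150, 90), (2900, 120), (3900, 130), (4950, 140)]:
    out = resonator(out, fc, bw, fs)         # illustrative soprano-like formants
out /= np.abs(out).max()                     # normalize for playback
```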

RESULTS: Spectral source slope was strongly related to the perception of strain in both groups of listeners. Experienced listeners' perception of strain was also related to formant pattern, while inexperienced listeners' perception of strain was also related to pitch.

CONCLUSION: This study has shown that spectral source slope can be a powerful cue to the perception of strain. However, inexperienced and experienced listeners also differ from each other in how strain is perceived across speaking and singing pitches. These differences may be based on both experience and the goals of the listener.}, } @article {pmid38440592, year = {2024}, author = {Umashankar, A and Ramamoorthy, S and Selvaraj, JL and Dhandayutham, S}, title = {Comparative Study on the Acoustic Analysis of Voice in Auditory Brainstem Implantees, Cochlear Implantees, and Normal Hearing Children.}, journal = {Indian journal of otolaryngology and head and neck surgery : official publication of the Association of Otolaryngologists of India}, volume = {76}, number = {1}, pages = {645-652}, pmid = {38440592}, issn = {2231-3796}, abstract = {The aim of the study was to compare the acoustic characteristics of voice between Auditory Brainstem Implantees, Cochlear Implantees and normal hearing children. Voice parameters such as fundamental frequency, formant frequencies, perturbation measures, and harmonic-to-noise ratio were measured in a total of 30 children, of which 10 were Auditory Brainstem Implantees, 10 were Cochlear Implantees and 10 were normal hearing children. Parametric and nonparametric statistics were used to establish the nature of significant differences between the three groups. Overall deviancies were seen in the implanted group for all acoustic parameters. However, abnormal deviations were seen in individuals with Auditory Brainstem Implants, indicating a deficit in the auditory feedback loop that impacts voice characteristics. This feedback deficit could contribute to the poorer performance in the ABI and CI groups. The CI group performed comparatively better than the ABI group, suggesting that this type of implant preserves some degree of auditory feedback. However, additional supporting evidence is needed, ideally from a study with a larger sample size and a longitudinal design.}, } @article {pmid38435340, year = {2023}, author = {Cuadros, J and Z-Rivera, L and Castro, C and Whitaker, G and Otero, M and Weinstein, A and Martínez-Montes, E and Prado, P and Zañartu, M}, title = {DIVA Meets EEG: Model Validation Using Formant-Shift Reflex.}, journal = {Applied sciences (Basel, Switzerland)}, volume = {13}, number = {13}, pages = {}, pmid = {38435340}, issn = {2076-3417}, support = {P50 DC015446/DC/NIDCD NIH HHS/United States ; }, abstract = {The neurocomputational model 'Directions into Velocities of Articulators' (DIVA) was developed to account for various aspects of normal and disordered speech production and acquisition. The neural substrates of DIVA were established through functional magnetic resonance imaging (fMRI), providing physiological validation of the model. This study introduces DIVA_EEG, an extension of DIVA that utilizes electroencephalography (EEG) to leverage the high temporal resolution and broad availability of EEG over fMRI. For the development of DIVA_EEG, EEG-like signals were derived from original equations describing the activity of the different DIVA maps. Synthetic EEG associated with the utterance of syllables was generated when both unperturbed and perturbed auditory feedback (first formant perturbations) were simulated. The cortical activation maps derived from synthetic EEG closely resembled those of the original DIVA model.
To validate DIVA_EEG, the EEG of individuals with typical voices (N = 30) was acquired during an altered auditory feedback paradigm. The resulting empirical brain activity maps significantly overlapped with those predicted by DIVA_EEG. In conjunction with other recent model extensions, DIVA_EEG lays the foundations for constructing a complete neurocomputational framework to tackle vocal and speech disorders, which can guide model-driven personalized interventions.}, } @article {pmid38418558, year = {2024}, author = {Fletcher, MD and Akis, E and Verschuur, CA and Perry, SW}, title = {Improved tactile speech perception using audio-to-tactile sensory substitution with formant frequency focusing.}, journal = {Scientific reports}, volume = {14}, number = {1}, pages = {4889}, pmid = {38418558}, issn = {2045-2322}, support = {EP/W032422/1//Engineering and Physical Sciences Research Council/ ; EP/T517859/1//Engineering and Physical Sciences Research Council/ ; }, mesh = {Humans ; *Speech Perception ; Speech ; Touch ; *Touch Perception ; *Cochlear Implants ; }, abstract = {Haptic hearing aids, which provide speech information through tactile stimulation, could substantially improve outcomes for both cochlear implant users and for those unable to access cochlear implants. Recent advances in wide-band haptic actuator technology have made new audio-to-tactile conversion strategies viable for wearable devices. One such strategy filters the audio into eight frequency bands, which are evenly distributed across the speech frequency range. The amplitude envelopes from the eight bands modulate the amplitudes of eight low-frequency tones, which are delivered through vibration to a single site on the wrist. This tactile vocoder strategy effectively transfers some phonemic information, but vowels and obstruent consonants are poorly portrayed. In 20 participants with normal touch perception, we tested (1) whether focusing the audio filters of the tactile vocoder more densely around the first and second formant frequencies improved tactile vowel discrimination, and (2) whether focusing filters at mid-to-high frequencies improved obstruent consonant discrimination. The obstruent-focused approach was found to be ineffective. However, the formant-focused approach improved vowel discrimination by 8%, without changing overall consonant discrimination. The formant-focused tactile vocoder strategy, which can readily be implemented in real time on a compact device, could substantially improve speech perception for haptic hearing aid users.}, } @article {pmid38381271, year = {2024}, author = {Maya Lastra, N and Rangel Negrín, A and Coyohua Fuentes, A and Dias, PAD}, title = {Mantled howler monkey males assess their rivals through formant spacing of long-distance calls.}, journal = {Primates; journal of primatology}, volume = {65}, number = {3}, pages = {183-190}, pmid = {38381271}, issn = {1610-7365}, support = {726265//Consejo Nacional de Ciencia y Tecnología/ ; 15 1529//Consejo Veracruzano de Ciencia y Tecnología/ ; }, mesh = {Male ; Animals ; *Glucocorticoids/metabolism ; Vocalization, Animal/physiology ; *Alouatta/physiology ; Testosterone ; }, abstract = {Formant frequency spacing of long-distance vocalizations is allometrically related to body size and could represent an honest signal of fighting potential. There is, however, only limited evidence that primates use formant spacing to assess the competitive potential of rivals during interactions with extragroup males, a risky context. 
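The tactile vocoder strategy described in the Fletcher et al. entry above can be sketched as a filter bank whose band envelopes modulate low-frequency vibration tones; the band edges and tone frequencies below are illustrative, not the published parameters:

```python
import numpy as np
from scipy.signal import butter, sosfiltfilt, hilbert

def tactile_vocoder(x, fs, band_edges, tone_freqs):
    t = np.arange(x.size) / fs
    out = np.zeros_like(x)
    for (lo, hi), ft in zip(band_edges, tone_freqs):
        sos = butter(4, [lo, hi], btype="bandpass", fs=fs, output="sos")
        env = np.abs(hilbert(sosfiltfilt(sos, x)))   # band amplitude envelope
        out += env * np.sin(2 * np.pi * ft * t)      # modulate a vibration tone
    return out / np.abs(out).max()

fs = 16000
x = np.random.randn(fs)                              # stand-in for a speech signal
edges = [(100, 400), (400, 800), (800, 1600), (1600, 3200)]
tones = [30, 60, 90, 120]                            # Hz, within the tactile range
vib = tactile_vocoder(x, fs, edges, tones)
```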
We hypothesized that if formant spacing of long-distance calls is inversely related to the fighting potential of male mantled howler monkeys (Alouatta palliata), then males should: (1) be more likely and (2) faster to display vocal responses to calling rivals; (3) be more likely and (4) faster to approach calling rivals; and have higher fecal (5) glucocorticoid and (6) testosterone metabolite concentrations in response to rivals calling at intermediate and high formant spacing than to those with low formant spacing. We studied the behavioral responses of 11 adult males to playback experiments of long-distance calls from unknown individuals with low (i.e., emulating large individuals), intermediate, and high (i.e., small individuals) formant spacing (n = 36 experiments). We assayed fecal glucocorticoid and testosterone metabolite concentrations (n = 174). Playbacks always elicited vocal responses, but males responded quicker to intermediate than to low formant spacing playbacks. Low formant spacing calls were less likely to elicit approaches whereas high formant spacing calls resulted in quicker approaches. Males showed stronger hormonal responses to low than to both intermediate and high formant spacing calls. It is possible that males do not escalate conflicts with rivals with low formant spacing calls if these are perceived as large, and against whom winning probabilities should decrease and confrontation costs increase; but are willing to escalate conflicts with rivals of high formant spacing. Formant spacing may therefore be an important signal for rival assessment in this species.}, } @article {pmid38364044, year = {2024}, author = {Merritt, B and Bent, T and Kilgore, R and Eads, C}, title = {Auditory free classification of gender diverse speakers.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {2}, pages = {1422-1436}, doi = {10.1121/10.0024521}, pmid = {38364044}, issn = {1520-8524}, mesh = {Humans ; Male ; Female ; *Speech Perception ; Voice Quality ; Speech Acoustics ; Masculinity ; *Sexual and Gender Minorities ; }, abstract = {Auditory attribution of speaker gender has historically been assumed to operate within a binary framework. The prevalence of gender diversity and its associated sociophonetic variability motivates an examination of how listeners perceptually represent these diverse voices. Utterances from 30 transgender (1 agender individual, 15 non-binary individuals, 7 transgender men, and 7 transgender women) and 30 cisgender (15 men and 15 women) speakers were used in an auditory free classification paradigm, in which cisgender listeners classified the speakers on perceived general similarity and gender identity. Multidimensional scaling of listeners' classifications revealed two-dimensional solutions as the best fit for general similarity classifications. The first dimension was interpreted as masculinity/femininity, where listeners organized speakers from high to low fundamental frequency and first formant frequency. The second was interpreted as gender prototypicality, where listeners separated speakers with fundamental frequency and first formant frequency at upper and lower extreme values from more intermediate values. Listeners' classifications for gender identity collapsed into a one-dimensional space interpreted as masculinity/femininity. Results suggest that listeners engage in fine-grained analysis of speaker gender that cannot be adequately captured by a gender dichotomy.
Further, varying terminology used in instructions may bias listeners' gender judgements.}, } @article {pmid38358292, year = {2024}, author = {Almurashi, W and Al-Tamimi, J and Khattab, G}, title = {Dynamic specification of vowels in Hijazi Arabic.}, journal = {Phonetica}, volume = {81}, number = {2}, pages = {185-220}, pmid = {38358292}, issn = {1423-0321}, mesh = {Male ; Female ; Humans ; *Phonetics ; *Speech Acoustics ; Language ; Acoustics ; Cues ; }, abstract = {Research on various languages shows that dynamic approaches to vowel acoustics - in particular Vowel-Inherent Spectral Change (VISC) - can play a vital role in characterising and classifying monophthongal vowels compared with a static model. This study's aim was to investigate whether dynamic cues also allow for better description and classification of the Hijazi Arabic (HA) vowel system, a phonological system based on both temporal and spectral distinctions. Along with static and dynamic F1 and F2 patterns, we evaluated the extent to which vowel duration, F0, and F3 contribute to increased/decreased discriminability among vowels. Data were collected from 20 native HA speakers (10 females and 10 males) producing eight HA monophthongal vowels in a word list with varied consonantal contexts. Results showed that dynamic cues provide further insights regarding HA vowels that are not normally gleaned from static measures alone. Using discriminant analysis, the dynamic cues (particularly the seven-point model) had relatively higher classification rates, and vowel duration was found to play a significant role as an additional cue. Our results are in line with dynamic approaches and highlight the importance of looking beyond static cues and beyond the first two formants for further insights into the description and classification of vowel systems.}, } @article {pmid38348589, year = {2024}, author = {Simeone, PJ and Green, JR and Tager-Flusberg, H and Chenausky, KV}, title = {Vowel distinctiveness as a concurrent predictor of expressive language function in autistic children.}, journal = {Autism research : official journal of the International Society for Autism Research}, volume = {17}, number = {2}, pages = {419-431}, doi = {10.1002/aur.3102}, pmid = {38348589}, issn = {1939-3806}, support = {R00 DC017490/DC/NIDCD NIH HHS/United States ; /NH/NIH HHS/United States ; P50 DC013027/DC/NIDCD NIH HHS/United States ; K24 DC016312/DC/NIDCD NIH HHS/United States ; P50 DC018006/DC/NIDCD NIH HHS/United States ; }, mesh = {Child ; Humans ; *Autistic Disorder/complications ; *Autism Spectrum Disorder/complications ; Language ; Speech ; *Language Disorders ; Phonetics ; }, abstract = {Speech ability may limit spoken language development in some minimally verbal autistic children. In this study, we aimed to determine whether an acoustic measure of speech production, vowel distinctiveness, is concurrently related to expressive language (EL) for autistic children. Syllables containing the vowels [i] and [a] were recorded remotely from 27 autistic children (4;1-7;11) with a range of spoken language abilities. Vowel distinctiveness was calculated using automatic formant tracking software. Robust hierarchical regressions were conducted with receptive language (RL) and vowel distinctiveness as predictors of EL. Hierarchical regressions were also conducted within a High EL and a Low EL subgroup. Vowel distinctiveness accounted for 29% of the variance in EL for the entire group, RL for 38%. 
For the Low EL group, only vowel distinctiveness was significant, accounting for 38% of variance in EL. Conversely, in the High EL group, only RL was significant and accounted for 26% of variance in EL. Replicating previous results, speech production and RL significantly predicted concurrent EL in autistic children, with speech production being the sole significant predictor for the Low EL group and RL the sole significant predictor for the High EL group. Further work is needed to determine whether vowel distinctiveness longitudinally, as well as concurrently, predicts EL. Findings have important implications for the early identification of language impairment and in developing language interventions for autistic children.}, } @article {pmid38341748, year = {2024}, author = {Shadle, CH and Fulop, SA and Chen, WR and Whalen, DH}, title = {Assessing accuracy of resonances obtained with reassigned spectrograms from the "ground truth" of physical vocal tract models.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {2}, pages = {1253-1263}, pmid = {38341748}, issn = {1520-8524}, support = {R01 DC002717/DC/NIDCD NIH HHS/United States ; }, mesh = {Child ; Humans ; *Voice ; Acoustics ; Speech Acoustics ; Vibration ; Sound Spectrography ; }, abstract = {The reassigned spectrogram (RS) has emerged as the most accurate way to infer vocal tract resonances from the acoustic signal [Shadle, Nam, and Whalen (2016). "Comparing measurement errors for formants in synthetic and natural vowels," J. Acoust. Soc. Am. 139(2), 713-727]. To date, validating its accuracy has depended on formant synthesis for ground truth values of these resonances. Synthesis is easily controlled, but it has many intrinsic assumptions that do not necessarily accurately realize the acoustics in the way that physical resonances would. Here, we show that physical models of the vocal tract with derivable resonance values allow a separate approach to the ground truth, with a different range of limitations. Our three-dimensional printed vocal tract models were excited by white noise, allowing an accurate determination of the resonance frequencies. Then, sources with a range of fundamental frequencies were implemented, allowing a direct assessment of whether RS avoided the systematic bias towards the nearest strong harmonic to which other analysis techniques are prone. RS was indeed accurate at fundamental frequencies up to 300 Hz; above that, accuracy was somewhat reduced. 
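For the vowel-distinctiveness measure in the Simeone et al. entry above, one plausible operationalization (an assumption; the authors' exact formula may differ) is the Euclidean distance between the [i] and [a] centroids in F1-F2 space:

```python
import numpy as np

def vowel_distinctiveness(f1_i, f2_i, f1_a, f2_a):
    c_i = np.array([np.mean(f1_i), np.mean(f2_i)])   # [i] centroid
    c_a = np.array([np.mean(f1_a), np.mean(f2_a)])   # [a] centroid
    return np.linalg.norm(c_i - c_a)                 # distance in Hz

# hypothetical per-token formant values
print(vowel_distinctiveness([320, 340, 310], [2400, 2350, 2450],
                            [780, 800, 760], [1250, 1300, 1280]))
```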
Future directions include testing mechanical models with the dimensions of children's vocal tracts and making RS more broadly useful by automating the detection of resonances.}, } @article {pmid38319369, year = {2024}, author = {Saghiri, MA and Vakhnovetsky, J and Amanabi, M and Karamifar, K and Farhadi, M and Amini, SB and Conte, M}, title = {Exploring the impact of type II diabetes mellitus on voice quality.}, journal = {European archives of oto-rhino-laryngology : official journal of the European Federation of Oto-Rhino-Laryngological Societies (EUFOS) : affiliated with the German Society for Oto-Rhino-Laryngology - Head and Neck Surgery}, volume = {281}, number = {5}, pages = {2707-2716}, pmid = {38319369}, issn = {1434-4726}, mesh = {Humans ; Voice Quality ; Speech Acoustics ; *Diabetes Mellitus, Type 2/complications ; Cross-Sectional Studies ; Speech Production Measurement ; *Voice ; Acoustics ; }, abstract = {PURPOSE: This cross-sectional study aimed to investigate the potential of voice analysis as a prescreening tool for type II diabetes mellitus (T2DM) by examining the differences in voice recordings between non-diabetic and T2DM participants.

METHODS: Sixty participants diagnosed as non-diabetic (n = 30) or T2DM (n = 30) were recruited on the basis of specific inclusion and exclusion criteria in Iran between February 2020 and September 2023. Participants were matched according to their year of birth and then placed into six age categories. Using the WhatsApp application, participants recorded the translated versions of speech elicitation tasks. Seven acoustic features [fundamental frequency, jitter, shimmer, harmonic-to-noise ratio (HNR), cepstral peak prominence (CPP), voice onset time (VOT), and formant (F1-F2)] were extracted from each recording and analyzed using Praat software. Data were analyzed with Kolmogorov-Smirnov, two-way ANOVA, post hoc Tukey, binary logistic regression, and Student's t tests.

RESULTS: The comparison between groups showed significant differences in fundamental frequency, jitter, shimmer, CPP, and HNR (p < 0.05), while there were no significant differences in formant and VOT (p > 0.05). Binary logistic regression showed that shimmer was the most significant predictor of the disease group. There was also a significant interaction between diabetes status and age in the case of CPP.

CONCLUSIONS: Participants with type II diabetes exhibited significant vocal variations compared to non-diabetic controls.}, } @article {pmid38299984, year = {2024}, author = {Benway, NR and Preston, JL and Salekin, A and Hitchcock, E and McAllister, T}, title = {Evaluating acoustic representations and normalization for rhoticity classification in children with speech sound disorders.}, journal = {JASA express letters}, volume = {4}, number = {2}, pages = {}, pmid = {38299984}, issn = {2691-1191}, support = {R01 DC017476/DC/NIDCD NIH HHS/United States ; R01 DC020959/DC/NIDCD NIH HHS/United States ; T32 DC000046/DC/NIDCD NIH HHS/United States ; }, mesh = {Child ; Humans ; *Speech Sound Disorder/diagnosis ; Acoustics ; Engineering ; Models, Statistical ; Neural Networks, Computer ; }, abstract = {The effects of different acoustic representations and normalizations were compared for classifiers predicting perception of children's rhotic versus derhotic /ɹ/. Formant and Mel frequency cepstral coefficient (MFCC) representations for 350 speakers were z-standardized, either relative to values in the same utterance or age-and-sex data for typical /ɹ/. Statistical modeling indicated age-and-sex normalization significantly increased classifier performances. Clinically interpretable formants performed similarly to MFCCs and were endorsed for deep neural network engineering, achieving mean test-participant-specific F1-score = 0.81 after personalization and replication (σx = 0.10, med = 0.83, n = 48). Shapley additive explanations analysis indicated the third formant most influenced fully rhotic predictions.}, } @article {pmid38257406, year = {2024}, author = {Hou, Y and Li, Q and Wang, Z and Liu, T and He, Y and Li, H and Ren, Z and Guo, X and Yang, G and Liu, Y and Yu, L}, title = {Study on a Pig Vocalization Classification Method Based on Multi-Feature Fusion.}, journal = {Sensors (Basel, Switzerland)}, volume = {24}, number = {2}, pages = {}, pmid = {38257406}, issn = {1424-8220}, support = {2021ZD0113803//Scientific and Technological Innovation 2030 Program of China Ministry of Science and Technology/ ; 20YFZCSN00220//Tianjin Science and Technology Planning Project/ ; JKZX202214//Beijing Academy of Agriculture and Forestry Sciences Outstanding Scientist Training Program/ ; }, mesh = {Swine ; Animals ; *Recognition, Psychology ; *Cough ; Neural Networks, Computer ; Principal Component Analysis ; }, abstract = {To improve the classification of pig vocalization using vocal signals and improve recognition accuracy, a pig vocalization classification method based on multi-feature fusion is proposed in this study. With the typical vocalization of pigs in large-scale breeding houses as the research object, short-time energy, frequency centroid, formant frequency and first-order difference, and Mel frequency cepstral coefficient and first-order difference were extracted as the fusion features. These fusion features were improved using principal component analysis. A pig vocalization classification model with a BP neural network optimized based on the genetic algorithm was constructed. 
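The two normalizations compared in the Benway et al. entry above can be sketched as follows (hedged; the reference table is a fabricated placeholder, not published norms):

```python
import numpy as np

def z_within_utterance(track):
    """z-score a formant track against values in the same utterance."""
    return (track - track.mean()) / track.std()

def z_age_sex(track, mean, sd):
    """z-score against age-and-sex reference values for typical /ɹ/."""
    return (track - mean) / sd

f3 = np.array([2700.0, 2650.0, 2600.0, 2580.0])   # hypothetical F3 track (Hz)
norms = {("10", "F"): (2950.0, 180.0)}             # fabricated age/sex norms
mu, sd = norms[("10", "F")]
print(z_within_utterance(f3), z_age_sex(f3, mu, sd))
```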
The results showed that using the improved features to recognize pig grunting, squealing, and coughing, the average recognition accuracy was 93.2%; the recognition precisions were 87.9%, 98.1%, and 92.7%, respectively, with an average of 92.9%; and the recognition recalls were 92.0%, 99.1%, and 87.4%, respectively, with an average of 92.8%, which indicated that the proposed pig vocalization classification method had good recognition precision and recall, and could provide a reference for pig vocalization information feedback and automatic recognition.}, } @article {pmid38252795, year = {2024}, author = {Nagamine, T}, title = {Formant dynamics in second language speech: Japanese speakers' production of English liquids.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {1}, pages = {479-495}, doi = {10.1121/10.0024351}, pmid = {38252795}, issn = {1520-8524}, mesh = {Humans ; *Acoustics ; *Language ; *Speech ; }, abstract = {This article reports an acoustic study analysing the time-varying spectral properties of word-initial English liquids produced by 31 first-language (L1) Japanese and 14 L1 English speakers. While it is widely accepted that L1 Japanese speakers have difficulty in producing English /l/ and /ɹ/, the temporal characteristics of L2 English liquids are not well-understood, even in light of previous findings that English liquids show dynamic properties. In this study, the distance between the first and second formants (F2-F1) and the third formant (F3) are analysed dynamically over liquid-vowel intervals in three vowel contexts using generalised additive mixed models (GAMMs). The results demonstrate that L1 Japanese speakers produce word-initial English liquids with stronger vocalic coarticulation than L1 English speakers. L1 Japanese speakers may have difficulty in dissociating F2-F1 between the liquid and the vowel to a varying degree, depending on the vowel context, which could be related to perceptual factors. This article shows that dynamic information uncovers specific challenges that L1 Japanese speakers have in producing L2 English liquids accurately.}, } @article {pmid38226202, year = {2023}, author = {Ghaemi, H and Grillo, R and Alizadeh, O and Shirzadeh, A and Ejtehadi, B and Torkzadeh, M and Samieirad, S}, title = {What Is the Effect of Maxillary Impaction Orthognathic Surgery on Voice Characteristics? A Quasi-Experimental Study.}, journal = {World journal of plastic surgery}, volume = {12}, number = {3}, pages = {44-56}, pmid = {38226202}, issn = {2228-7914}, abstract = {BACKGROUND: Regarding the impact of orthognathic surgery on the airway and voice, this study was carried out to investigate the effects of maxillary impaction surgery on patients' voices through acoustic analysis and articulation assessment.

METHODS: This quasi-experimental, before-and-after, double-blind study aimed to examine the effects of maxillary impaction surgery on the voice of orthognathic surgery patients. Before the surgery, a speech therapist conducted acoustic analysis, which included fundamental frequency (F0), Jitter, Shimmer, and the harmonic-to-noise ratio (HNR), as well as first, second, and third formants (F1, F2, and F3). The patient's age, sex, degree of maxillary deformity, and impaction were documented in a checklist. Voice analysis was repeated during follow-up appointments at one and six months after the surgery in a blinded manner. The data were statistically analyzed using SPSS 23, and the significance level was set at 0.05.

RESULTS: Twenty-two patients (18 females, 4 males) were examined, with ages ranging from 18 to 40 years and an average age of 25.54 years. F2, F3, HNR, and Shimmer demonstrated a significant increase over the investigation period compared to the initial phase of the study (P < 0.001 for each). Conversely, the Jitter variable exhibited a significant decrease during the follow-up assessments in comparison to the initial phase of the study (P < 0.001).

CONCLUSION: Following maxillary impaction surgery, improvements in voice quality were observed compared to the preoperative condition. However, further studies with larger samples are needed to confirm the relevance of these findings.}, } @article {pmid38214609, year = {2024}, author = {Hedrick, M and Thornton, K}, title = {Reaction time for correct identification of vowels in consonant-vowel syllables and of vowel segments.}, journal = {JASA express letters}, volume = {4}, number = {1}, pages = {}, doi = {10.1121/10.0024334}, pmid = {38214609}, issn = {2691-1191}, mesh = {Adult ; Humans ; Young Adult ; Reaction Time ; *Phonetics ; }, abstract = {Reaction times for correct vowel identification were measured to determine the effects of intertrial intervals, vowel, and cue type. Thirteen adults with normal hearing, aged 20-38 years, participated. Stimuli included three naturally produced syllables (/ba/ /bi/ /bu/) presented whole or segmented to isolate the formant transition or static formant center. Participants identified the vowel presented via loudspeaker by mouse click. Results showed a significant effect of intertrial intervals, no significant effect of cue type, and a significant vowel effect, suggesting that feedback occurs, vowel identification may depend on cue duration, and vowel bias may stem from focal structure.}, } @article {pmid38174963, year = {2024}, author = {Sathe, NC and Kain, A and Reiss, LAJ}, title = {Fusion of dichotic consonants in normal-hearing and hearing-impaired listeners.}, journal = {The Journal of the Acoustical Society of America}, volume = {155}, number = {1}, pages = {68-77}, pmid = {38174963}, issn = {1520-8524}, support = {R01 DC013307/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Hearing Loss, Sensorineural ; *Speech Perception/physiology ; *Hearing Loss ; Psychoacoustics ; Phonetics ; Hearing ; }, abstract = {Hearing-impaired (HI) listeners have been shown to exhibit increased fusion of dichotic vowels, even with different fundamental frequency (F0), leading to binaural spectral averaging and interference. To determine if similar fusion and averaging occur for consonants, four natural and synthesized stop consonants (/pa/, /ba/, /ka/, /ga/) at three F0s of 74, 106, and 185 Hz were presented dichotically, with ΔF0 varied, to normal-hearing (NH) and HI listeners. Listeners identified the one or two consonants perceived, and response options included /ta/ and /da/ as fused percepts. As ΔF0 increased, both groups showed decreases in fusion and increases in percent correct identification of both consonants, with HI listeners displaying similar fusion but poorer identification. Both groups exhibited spectral averaging (psychoacoustic fusion) of place of articulation but phonetic feature fusion for differences in voicing. With synthetic consonants, NH subjects showed increased fusion and decreased identification. Most HI listeners were unable to discriminate the synthetic consonants. The findings suggest smaller differences between groups in consonant fusion than vowel fusion, possibly due to the presence of more cues for segregation in natural speech or reduced reliance on spectral cues for consonant perception.
The inability of HI listeners to discriminate synthetic consonants suggests a reliance on cues other than formant transitions for consonant discrimination.}, } @article {pmid38165498, year = {2024}, author = {Wang, L and Liu, R and Wang, Y and Xu, X and Zhang, R and Wei, Y and Zhu, R and Zhang, X and Wang, F}, title = {Effectiveness of a Biofeedback Intervention Targeting Mental and Physical Health Among College Students Through Speech and Physiology as Biomarkers Using Machine Learning: A Randomized Controlled Trial.}, journal = {Applied psychophysiology and biofeedback}, volume = {49}, number = {1}, pages = {71-83}, pmid = {38165498}, issn = {1573-3270}, support = {ZD2021026//Key Project supported by Medical Science and Technology Development Foundation, Jiangsu Commission of Health/ ; 62176129//National Natural Science Foundation of China/ ; 81725005//National Science Fund for Distinguished Young Scholars/ ; U20A6005//National Natural Science Foundation Regional Innovation and Development Joint Fund/ ; BE2021617//Jiangsu Provincial Key Research and Development Program/ ; }, mesh = {Humans ; *Speech ; *Sleep Initiation and Maintenance Disorders ; Biofeedback, Psychology/methods ; Students/psychology ; Biomarkers ; Machine Learning ; }, abstract = {Biofeedback therapy is mainly based on the analysis of physiological features to improve an individual's affective state. There are insufficient objective indicators to assess symptom improvement after biofeedback. In addition to psychological and physiological features, speech features can precisely convey information about emotions. The use of speech features can improve the objectivity of psychiatric assessments. Therefore, biofeedback based on subjective symptom scales, objective speech, and physiological features to evaluate efficacy provides a new approach for early screening and treatment of emotional problems in college students. A 4-week, randomized, controlled, parallel biofeedback therapy study was conducted with college students with symptoms of anxiety or depression. Speech samples, physiological samples, and clinical symptoms were collected at baseline and at the end of treatment, and the extracted speech features and physiological features were used for between-group comparisons and correlation analyses between the biofeedback and wait-list groups. Based on the speech features with differences between the biofeedback intervention and wait-list groups, an artificial neural network was used to predict the therapeutic effect and response after biofeedback therapy. Through biofeedback therapy, improvements in depression (p = 0.001), anxiety (p = 0.001), insomnia (p = 0.013), and stress (p = 0.004) severity were observed in college-going students (n = 52). The speech and physiological features in the biofeedback group also changed significantly compared to the waitlist group (n = 52) and were related to the change in symptoms. The energy parameters and Mel-Frequency Cepstral Coefficients (MFCC) of speech features can predict whether biofeedback intervention effectively improves anxiety and insomnia symptoms and treatment response. The accuracy of the classification model built using the artificial neural network (ANN) for treatment response and non-response was approximately 60%. The results of this study provide valuable information about biofeedback in improving the mental health of college-going students. 
The study identified speech features such as the energy parameters and MFCC as more accurate and objective indicators for tracking biofeedback therapy response and predicting efficacy. Trial Registration ClinicalTrials.gov ChiCTR2100045542.}, } @article {pmid38158551, year = {2024}, author = {Anikin, A and Barreda, S and Reby, D}, title = {A practical guide to calculating vocal tract length and scale-invariant formant patterns.}, journal = {Behavior research methods}, volume = {56}, number = {6}, pages = {5588-5604}, pmid = {38158551}, issn = {1554-3528}, mesh = {Humans ; *Software ; Phonetics ; Speech/physiology ; Speech Acoustics ; Vocal Cords/physiology ; Acoustics ; }, abstract = {Formants (vocal tract resonances) are increasingly analyzed not only by phoneticians in speech but also by behavioral scientists studying diverse phenomena such as acoustic size exaggeration and articulatory abilities of non-human animals. This often involves estimating vocal tract length acoustically and producing scale-invariant representations of formant patterns. We present a theoretical framework and practical tools for carrying out this work, including open-source software solutions included in R packages soundgen and phonTools. Automatic formant measurement with linear predictive coding is error-prone, but formant_app provides an integrated environment for formant annotation and correction with visual and auditory feedback. Once measured, formants can be normalized using a single recording (intrinsic methods) or multiple recordings from the same individual (extrinsic methods). Intrinsic speaker normalization can be as simple as taking formant ratios and calculating the geometric mean as a measure of overall scale. The regression method implemented in the function estimateVTL calculates the apparent vocal tract length assuming a single-tube model, while its residuals provide a scale-invariant vowel space based on how far each formant deviates from equal spacing (the schwa function). Extrinsic speaker normalization provides more accurate estimates of speaker- and vowel-specific scale factors by pooling information across recordings with simple averaging or mixed models, which we illustrate with example datasets and R code. The take-home messages are to record several calls or vowels per individual, measure at least three or four formants, check formant measurements manually, treat uncertain values as missing, and use the statistical tools best suited to each modeling context.}, } @article {pmid38135960, year = {2023}, author = {Kraxberger, F and Näger, C and Laudato, M and Sundström, E and Becker, S and Mihaescu, M and Kniesburges, S and Schoder, S}, title = {On the Alignment of Acoustic and Coupled Mechanic-Acoustic Eigenmodes in Phonation by Supraglottal Duct Variations.}, journal = {Bioengineering (Basel, Switzerland)}, volume = {10}, number = {12}, pages = {}, pmid = {38135960}, issn = {2306-5354}, support = {39480417//Austrian Research Promotion Agency/ ; 446965891//Deutsche Forschungsgemeinschaft/ ; n/a//TU Graz Open Access Publishing Fund/ ; }, abstract = {Sound generation in human phonation and the underlying fluid-structure-acoustic interaction that describes the sound production mechanism are not fully understood. A previous experimental study, with a silicone-made vocal fold model connected to a straight vocal tract pipe of fixed length, showed that vibroacoustic coupling can cause a deviation in the vocal fold vibration frequency.
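The single-tube estimate implemented by estimateVTL (described in the Anikin et al. entry above) rests on the uniform-tube relation F_k = (2k - 1) * c / (4 * L), so a through-origin regression of measured formants on the odd numbers (2k - 1) yields c / (4 * L). A worked sketch with hypothetical formant values:

```python
import numpy as np

c = 35000.0                                   # speed of sound, cm/s
formants = np.array([500.0, 1500.0, 2500.0, 3500.0])   # F1-F4 (Hz), hypothetical
k = np.arange(1, formants.size + 1)
x = 2 * k - 1                                 # 1, 3, 5, 7

slope = np.sum(x * formants) / np.sum(x * x)  # least squares through the origin
vtl_cm = c / (4 * slope)
print(f"apparent VTL = {vtl_cm:.1f} cm")      # 17.5 cm for these values
```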
This occurred when the fundamental frequency of the vocal fold motion was close to the lowest acoustic resonance frequency of the pipe. What is not fully understood is how the vibroacoustic coupling is influenced by a varying vocal tract length. Presuming that this effect is a pure coupling of the acoustical effects, a numerical simulation model is established based on the computation of the mechanical-acoustic eigenvalue. With varying pipe lengths, the lowest acoustic resonance frequency was adjusted in the experiments and likewise in the simulation setup. In doing so, the evolution of the vocal folds' coupled eigenvalues and eigenmodes is investigated, which confirms the experimental findings. Finally, it was shown that for normal phonation conditions, the mechanical mode is the most efficient vibration pattern whenever the acoustic resonance of the pipe (lowest formant) is far away from the vocal folds' vibration frequency. Whenever the lowest formant is slightly lower than the mechanical vocal fold eigenfrequency, the coupled vocal fold motion pattern at the formant frequency dominates.}, } @article {pmid38082914, year = {2023}, author = {Pah, ND and Motin, MA and Oliveira, GC and Kumar, DK}, title = {The Change of Vocal Tract Length in People with Parkinson's Disease.}, journal = {Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. Annual International Conference}, volume = {2023}, number = {}, pages = {1-4}, doi = {10.1109/EMBC40787.2023.10340263}, pmid = {38082914}, issn = {2694-0604}, mesh = {Humans ; Male ; *Parkinson Disease/complications/diagnosis ; *Voice ; Dysarthria/diagnosis/etiology ; Speech ; }, abstract = {Hypokinetic dysarthria is one of the early symptoms of Parkinson's disease (PD) and has been proposed for early detection and also for monitoring of the progression of the disease. PD reduces the control of vocal tract muscles such as the tongue and lips and, therefore, the length of the active vocal tract is altered. However, the change in the vocal tract length due to the disease has not been investigated. The aim of this study was to determine the difference in the apparent vocal tract length (AVTL) between people with PD and age-matched healthy controls. The phoneme /a/ from the UCI Parkinson's Disease Classification Dataset and the Italian Parkinson's Voice and Speech Dataset were used, and AVTL was calculated based on the first four formants of the sustained phoneme (F1-F4). The results show a correlation between Parkinson's disease and an increase in vocal tract length. The most sensitive feature was the AVTL calculated using the first formant of sustained phonemes (F1). The other significant finding reported in this article is that the difference appeared only in the male participants.
However, the size of the database is not sufficiently large to identify the possible confounding factors such as the severity and duration of the disease, medication, age, and comorbidity. Clinical relevance: The outcomes of this research have the potential to improve the identification of early Parkinsonian dysarthria and monitor PD progression.}, } @article {pmid38061210, year = {2024}, author = {Orekhova, EV and Fadeev, KA and Goiaeva, DE and Obukhova, TS and Ovsiannikova, TM and Prokofyev, AO and Stroganova, TA}, title = {Different hemispheric lateralization for periodicity and formant structure of vowels in the auditory cortex and its changes between childhood and adulthood.}, journal = {Cortex; a journal devoted to the study of the nervous system and behavior}, volume = {171}, number = {}, pages = {287-307}, doi = {10.1016/j.cortex.2023.10.020}, pmid = {38061210}, issn = {1973-8102}, mesh = {Adult ; Humans ; Child ; *Auditory Cortex/physiology ; Acoustic Stimulation ; Auditory Perception/physiology ; Magnetoencephalography ; Speech/physiology ; *Speech Perception/physiology ; }, abstract = {The spectral formant structure and periodicity pitch are the major features that determine the identity of vowels and the characteristics of the speaker. However, very little is known about how the processing of these features in the auditory cortex changes during development. To address this question, we independently manipulated the periodicity and formant structure of vowels while measuring auditory cortex responses using magnetoencephalography (MEG) in children aged 7-12 years and adults. We analyzed the sustained negative shift of source current associated with these vowel properties, which was present in the auditory cortex in both age groups despite differences in the transient components of the auditory response. In adults, the sustained activation associated with formant structure was lateralized to the left hemisphere early in the auditory processing stream, requiring neither attention nor semantic mapping. This lateralization was not yet established in children, in whom the right hemisphere contribution to formant processing was strong and decreased during or after puberty. In contrast to the formant structure, periodicity was associated with a greater response in the right hemisphere in both children and adults. These findings suggest that left-lateralization for the automatic processing of vowel formant structure emerges relatively late in ontogenesis and poses a serious challenge to current theories of hemispheric specialization for speech processing.}, } @article {pmid38058304, year = {2023}, author = {Alain, C and Göke, K and Shen, D and Bidelman, GM and Bernstein, LJ and Snyder, JS}, title = {Neural alpha oscillations index context-driven perception of ambiguous vowel sequences.}, journal = {iScience}, volume = {26}, number = {12}, pages = {108457}, pmid = {38058304}, issn = {2589-0042}, abstract = {Perception of bistable stimuli is influenced by prior context. In some cases, the interpretation matches how the preceding stimulus was perceived; in others, it tends to be the opposite of the previous stimulus percept. We measured high-density electroencephalography (EEG) while participants were presented with a sequence of vowels that varied in formant transition, promoting the perception of one or two auditory streams followed by an ambiguous bistable sequence.
For the bistable sequence, participants were more likely to report hearing the opposite percept of the one heard immediately before. This auditory contrast effect coincided with changes in alpha power localized in the left angular gyrus and left sensorimotor and right sensorimotor/supramarginal areas. The latter correlated with participants' perception. These results suggest that the contrast effect for a bistable sequence of vowels may be related to neural adaptation in posterior auditory areas, which influences participants' perceptual construal level of ambiguous stimuli.}, } @article {pmid38050971, year = {2024}, author = {Shellikeri, S and Cho, S and Ash, S and Gonzalez-Recober, C and Mcmillan, CT and Elman, L and Quinn, C and Amado, DA and Baer, M and Irwin, DJ and Massimo, L and Olm, CA and Liberman, MY and Grossman, M and Nevler, N}, title = {Digital markers of motor speech impairments in spontaneous speech of patients with ALS-FTD spectrum disorders.}, journal = {Amyotrophic lateral sclerosis & frontotemporal degeneration}, volume = {25}, number = {3-4}, pages = {317-325}, pmid = {38050971}, issn = {2167-9223}, support = {K99 AG073510/AG/NIA NIH HHS/United States ; P01 AG066597/AG/NIA NIH HHS/United States ; R01 NS109260/NS/NINDS NIH HHS/United States ; K08 NS114106/NS/NINDS NIH HHS/United States ; P30 AG072979/AG/NIA NIH HHS/United States ; R01 AG054519/AG/NIA NIH HHS/United States ; }, mesh = {Humans ; *Frontotemporal Dementia/diagnosis/diagnostic imaging ; *Amyotrophic Lateral Sclerosis/complications/diagnosis ; Speech ; Magnetic Resonance Imaging ; *Dystonic Disorders ; }, abstract = {OBJECTIVE: To evaluate automated digital speech measures, derived from spontaneous speech (picture descriptions), in assessing bulbar motor impairments in patients with ALS-FTD spectrum disorders (ALS-FTSD).

METHODS: Automated vowel algorithms were employed to extract two vowel acoustic measures: vowel space area (VSA) and mean second formant slope (F2 slope). Vowel measures were compared across four groups: ALS with clinical bulbar symptoms (ALS+bulbar, n = 49; ALSFRS-R bulbar subscore: mean = 9.8, SD = 1.7), ALS without bulbar symptoms (ALS-nonbulbar, n = 23), behavioral variant frontotemporal dementia without a motor syndrome (bvFTD, n = 25), and healthy controls (HC, n = 32). Correlations with bulbar motor clinical scales, perceived listener effort, and MRI cortical thickness of the orobuccal primary motor cortex (oral PMC) were examined. We compared vowel measures to speaking rate, a conventional metric for assessing bulbar dysfunction.
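For readers implementing the VSA measure named above: one common definition is the area of the triangle spanned by the mean (F1, F2) coordinates of the corner vowels /a/, /i/, and /u/. Whether Shellikeri et al. used exactly this triangular variant is not stated in the abstract, so the sketch below is a generic illustration with invented formant values:

    def vowel_space_area(a, i, u):
        # Shoelace formula for the triangle spanned by the (F1, F2)
        # coordinates (Hz) of /a/, /i/, /u/; returns area in Hz^2.
        (f1a, f2a), (f1i, f2i), (f1u, f2u) = a, i, u
        return abs(f1a * (f2i - f2u) + f1i * (f2u - f2a) + f1u * (f2a - f2i)) / 2.0

    print(vowel_space_area(a=(850, 1220), i=(280, 2250), u=(310, 870)))  # 377850.0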

RESULTS: ALS+bulbar had significantly lower VSA and F2 slope than ALS-nonbulbar (|d| = 0.94 and |d| = 1.04, respectively), bvFTD (|d| = 0.89 and |d| = 1.47), and HC (|d| = 0.73 and |d| = 0.99). These reductions correlated with worse bulbar clinical scores (VSA: R = 0.33, p = 0.043; F2 slope: R = 0.38, p = 0.011), greater listener effort (VSA: R = -0.43, p = 0.041; F2 slope: p > 0.05), and cortical thinning in oral PMC (F2 slope: β = 0.0026, p = 0.017). Vowel measures demonstrated greater sensitivity and specificity for bulbar impairment than speaking rate, while showing independence from cognitive and respiratory impairments.

CONCLUSION: Automatic vowel measures are easily derived from a brief spontaneous speech sample, are sensitive to mild-to-moderate stages of bulbar disease in ALS-FTSD, and may offer better sensitivity to bulbar impairment than traditional assessments such as speaking rate.}, } @article {pmid38033551, year = {2023}, author = {Heeringa, AN and Jüchter, C and Beutelmann, R and Klump, GM and Köppl, C}, title = {Altered neural encoding of vowels in noise does not affect behavioral vowel discrimination in gerbils with age-related hearing loss.}, journal = {Frontiers in neuroscience}, volume = {17}, number = {}, pages = {1238941}, pmid = {38033551}, issn = {1662-4548}, abstract = {INTRODUCTION: Understanding speech in a noisy environment, as opposed to speech in quiet, becomes increasingly difficult with increasing age. Using the quiet-aged gerbil, we studied the effects of aging on speech-in-noise processing. Specifically, behavioral vowel discrimination and the encoding of these vowels by single auditory-nerve fibers were compared to elucidate some of the underlying mechanisms of age-related speech-in-noise perception deficits.

METHODS: Young-adult and quiet-aged Mongolian gerbils, of either sex, were trained to discriminate a deviant naturally spoken vowel in a sequence of vowel standards against a speech-like background noise. In addition, we recorded responses from single auditory-nerve fibers of young-adult and quiet-aged gerbils while presenting the same speech stimuli.

RESULTS: Behavioral vowel discrimination was not significantly affected by aging. For both young-adult and quiet-aged gerbils, the behavioral discrimination between /eː/ and /iː/ was more difficult to make than /eː/ vs. /aː/ or /iː/ vs. /aː/, as evidenced by longer response times and lower d' values. In young-adults, spike timing-based vowel discrimination agreed with the behavioral vowel discrimination, while in quiet-aged gerbils it did not. Paradoxically, discrimination between vowels based on temporal responses was enhanced in aged gerbils for all vowel comparisons. Representation schemes, based on the spectrum of the inter-spike interval histogram, revealed stronger encoding of both the fundamental and the lower formant frequencies in fibers of quiet-aged gerbils, but no qualitative changes in vowel encoding. Elevated thresholds in combination with a fixed stimulus level, i.e., lower sensation levels of the stimuli for old individuals, can explain the enhanced temporal coding of the vowels in noise.
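The d' values reported above come from signal detection theory; a standard way to compute them from discrimination data (with a conventional 1/(2N) correction for perfect rates; the counts below are invented) is:

    from scipy.stats import norm

    def d_prime(hits, misses, false_alarms, correct_rejections):
        # d' = z(hit rate) - z(false-alarm rate); clamp rates away from 0 and 1.
        n_s, n_n = hits + misses, false_alarms + correct_rejections
        h = min(max(hits / n_s, 0.5 / n_s), 1 - 0.5 / n_s)
        fa = min(max(false_alarms / n_n, 0.5 / n_n), 1 - 0.5 / n_n)
        return norm.ppf(h) - norm.ppf(fa)

    print(d_prime(45, 5, 10, 40))  # about 2.1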

DISCUSSION: These results suggest that the altered auditory-nerve discrimination metrics in old gerbils may mask age-related deterioration in the central (auditory) system to the extent that behavioral vowel discrimination matches that of the young adults.}, } @article {pmid38029503, year = {2024}, author = {Mohn, JL and Baese-Berk, MM and Jaramillo, S}, title = {Selectivity to acoustic features of human speech in the auditory cortex of the mouse.}, journal = {Hearing research}, volume = {441}, number = {}, pages = {108920}, pmid = {38029503}, issn = {1878-5891}, support = {R56 DC015531/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Mice ; Animals ; *Auditory Cortex/physiology ; Speech ; Acoustic Stimulation/methods ; *Speech Perception/physiology ; Acoustics ; Auditory Perception/physiology ; }, abstract = {A better understanding of the neural mechanisms of speech processing can have a major impact in the development of strategies for language learning and in addressing disorders that affect speech comprehension. Technical limitations in research with human subjects hinder a comprehensive exploration of these processes, making animal models essential for advancing the characterization of how neural circuits make speech perception possible. Here, we investigated the mouse as a model organism for studying speech processing and explored whether distinct regions of the mouse auditory cortex are sensitive to specific acoustic features of speech. We found that mice can learn to categorize frequency-shifted human speech sounds based on differences in formant transitions (FT) and voice onset time (VOT). Moreover, neurons across various auditory cortical regions were selective to these speech features, with a higher proportion of speech-selective neurons in the dorso-posterior region. Last, many of these neurons displayed mixed-selectivity for both features, an attribute that was most common in dorsal regions of the auditory cortex. Our results demonstrate that the mouse serves as a valuable model for studying the detailed mechanisms of speech feature encoding and neural plasticity during speech-sound learning.}, } @article {pmid38010781, year = {2024}, author = {Anikin, A and Valente, D and Pisanski, K and Cornec, C and Bryant, GA and Reby, D}, title = {The role of loudness in vocal intimidation.}, journal = {Journal of experimental psychology. General}, volume = {153}, number = {2}, pages = {511-530}, doi = {10.1037/xge0001508}, pmid = {38010781}, issn = {1939-2222}, support = {//Vetenskapsrådet/ ; //French National Research Agency (ANR)/ ; }, mesh = {Humans ; *Voice ; Voice Quality ; Aggression ; Communication ; Sound ; }, abstract = {Across many species, a major function of vocal communication is to convey formidability, with low voice frequencies traditionally considered the main vehicle for projecting large size and aggression. Vocal loudness is often ignored, yet it might explain some puzzling exceptions to this frequency code. Here we demonstrate, through acoustic analyses of over 3,000 human vocalizations and four perceptual experiments, that vocalizers produce low frequencies when attempting to sound large, but loudness is prioritized for displays of strength and aggression. Our results show that, although being loud is effective for signaling strength and aggression, it poses a physiological trade-off with low frequencies because a loud voice is achieved by elevating pitch and opening the mouth wide into a-like vowels. 
This may explain why aggressive vocalizations are often high-pitched and why open vowels are considered "large" in sound symbolism despite their high first formant. Callers often compensate by adding vocal harshness (nonlinear vocal phenomena) to undesirably high-pitched loud vocalizations, but a combination of low and loud remains an honest predictor of both perceived and actual physical formidability. The proposed notion of a loudness-frequency trade-off thus adds a new dimension to the widely accepted frequency code and requires a fundamental rethinking of the evolutionary forces shaping the form of acoustic signals. (PsycInfo Database Record (c) 2024 APA, all rights reserved).}, } @article {pmid38000960, year = {2023}, author = {Barrientos, E and Cataldo, E}, title = {Estimating Formant Frequencies of Vowels Sung by Sopranos Using Weighted Linear Prediction.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.10.018}, pmid = {38000960}, issn = {1873-4588}, abstract = {This study introduces the weighted linear prediction adapted to high-pitched singing voices (WLP-HPSV) method for accurately estimating formant frequencies of vowels sung by lyric sopranos. The WLP-HPSV method employs a variant of the WLP analysis combined with the zero-frequency filtering (ZFF) technique to address specific challenges in formant estimation from singing signals. Evaluation of the WLP-HPSV method compared to the LPC method demonstrated its superior performance in accurately capturing the spectral characteristics of synthetic /u/ vowels and the /a/ and /u/ natural singing vowels. The QCP parameters used in the WLP-HPSV method varied with pitch, revealing insights into the interplay between the vocal tract and glottal characteristics during vowel production. The comparison between the LPC and WLP-HPSV methods highlighted the robustness of the WLP-HPSV method in accurately estimating formant frequencies across different pitches.}, } @article {pmid37992456, year = {2024}, author = {Punamäki, RL and Diab, SY and Drosos, K and Qouta, SR and Vänskä, M}, title = {The role of acoustic features of maternal infant-directed singing in enhancing infant sensorimotor, language and socioemotional development.}, journal = {Infant behavior & development}, volume = {74}, number = {}, pages = {101908}, doi = {10.1016/j.infbeh.2023.101908}, pmid = {37992456}, issn = {1934-8800}, mesh = {Female ; Infant ; Child ; Infant, Newborn ; Humans ; *Singing ; Prospective Studies ; Speech ; Language ; Acoustics ; Language Development ; }, abstract = {The quality of infant-directed speech (IDS) and infant-directed singing (IDSi) are considered vital to children, but empirical studies on protomusical qualities of the IDSi influencing infant development are rare. The current prospective study examines the role of IDSi acoustic features, such as pitch variability, shape and movement, and vocal amplitude vibration, timbre, and resonance, in associating with infant sensorimotor, language, and socioemotional development at six and 18 months. The sample consists of 236 Palestinian mothers from Gaza Strip singing to their six-month-olds a song by their own choice. Maternal IDSi was recorded and analyzed by the OpenSMILE- tool to depict main acoustic features of pitch frequencies, variations, and contours, vocal intensity, resonance formants, and power. The results are based on completed 219 maternal IDSi. 
Mothers reported on their infants' sensorimotor, language-vocalization, and socioemotional skills at six months, and psychologists tested these skills with the Bayley Scales of Infant Development at 18 months. Results show that maternal IDSi characterized by wide pitch variability and rich and high vocal amplitude and vibration was associated with infants' optimal sensorimotor, language vocalization, and socioemotional skills at six months, and rich and high vocal amplitude and vibration predicted these optimal developmental skills also at 18 months. High resonance and rhythmicity formants were associated with optimal language and vocalization skills at six months. To conclude, the IDSi is considered important in enhancing newborns' and at-risk infants' wellbeing, and the current findings argue that favorable acoustic singing qualities are crucial for optimal multidomain development across infancy.}, } @article {pmid37992412, year = {2023}, author = {Levin, M and Zaltz, Y}, title = {Voice Discrimination in Quiet and in Background Noise by Simulated and Real Cochlear Implant Users.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {12}, pages = {5169-5186}, doi = {10.1044/2023_JSLHR-23-00019}, pmid = {37992412}, issn = {1558-9102}, mesh = {Humans ; *Cochlear Implants ; *Speech Perception ; *Cochlear Implantation/methods ; *Hearing Loss/rehabilitation ; Noise ; }, abstract = {PURPOSE: Cochlear implant (CI) users demonstrate poor voice discrimination (VD) in quiet conditions based on the speaker's fundamental frequency (fo) and formant frequencies (i.e., vocal-tract length [VTL]). Our purpose was to examine the effect of background noise at levels that allow good speech recognition thresholds (SRTs) on VD via acoustic CI simulations and CI hearing.

METHOD: Forty-eight normal-hearing (NH) listeners who listened via noise-excited (n = 20) or sinewave (n = 28) vocoders and 10 prelingually deaf CI users (i.e., whose hearing loss began before language acquisition) participated in the study. First, the signal-to-noise ratio (SNR) that yields 70.7% correct SRT was assessed using an adaptive sentence-in-noise test. Next, the CI simulation listeners performed 12 adaptive VDs: six in quiet conditions, two with each cue (fo, VTL, fo + VTL), and six amid speech-shaped noise. The CI participants performed six VDs: one with each cue, in quiet and amid noise. SNR at VD testing was 5 dB higher than the individual's SRT in noise (SRTn +5 dB).
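The 70.7%-correct SRT above is the convergence point of a two-down/one-up adaptive rule (Levitt, 1971). The exact procedure Levin and Zaltz used is not spelled out in the abstract, but a generic staircase of that family looks like this (`respond` is a hypothetical trial-scoring callback):

    def srt_two_down_one_up(respond, snr=10.0, step=2.0, n_reversals=8):
        # respond(snr) -> True if the sentence was repeated correctly at this SNR.
        # SNR falls after two consecutive correct trials and rises after any
        # error; the SRT estimate is the mean SNR at the final reversals.
        run, direction, reversals = 0, 0.0, []
        while len(reversals) < n_reversals:
            if respond(snr):
                run += 1
                move = -step if run == 2 else 0.0
                if run == 2:
                    run = 0
            else:
                run, move = 0, step
            if move:
                if direction and move * direction < 0:
                    reversals.append(snr)  # track direction changes
                direction = move
                snr += move
        return sum(reversals[-6:]) / 6.0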

RESULTS: Results showed the following: (a) Better VD was achieved via the noise-excited than the sinewave vocoder, with the noise-excited vocoder better mimicking CI VD; (b) background noise had a limited negative effect on VD, only for the CI simulation listeners; and (c) there was a significant association between SNR at testing and VTL VD only for the CI simulation listeners.

CONCLUSIONS: For NH listeners who listen to CI simulations, noise that allows good SRT can nevertheless impede VD, probably because VD depends more on bottom-up sensory processing. Conversely, for prelingually deaf CI users, noise that allows good SRT hardly affects VD, suggesting that they rely strongly on bottom-up processing for both VD and speech recognition.}, } @article {pmid37992404, year = {2024}, author = {Kapsner-Smith, MR and Abur, D and Eadie, TL and Stepp, CE}, title = {Test-Retest Reliability of Behavioral Assays of Feedforward and Feedback Auditory-Motor Control of Voice and Articulation.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {67}, number = {1}, pages = {34-48}, pmid = {37992404}, issn = {1558-9102}, support = {P50 DC015446/DC/NIDCD NIH HHS/United States ; T32 DC005361/DC/NIDCD NIH HHS/United States ; R01 DC015570/DC/NIDCD NIH HHS/United States ; F31 DC020359/DC/NIDCD NIH HHS/United States ; R01 DC016270/DC/NIDCD NIH HHS/United States ; }, mesh = {Male ; Humans ; Female ; Feedback ; Reproducibility of Results ; *Voice/physiology ; Speech ; Hearing ; }, abstract = {PURPOSE: Behavioral assays of feedforward and feedback auditory-motor control of voice and articulation frequently are used to make inferences about underlying neural mechanisms and to study speech development and disorders. However, no studies have examined the test-retest reliability of such measures, which is critical for rigorous study of auditory-motor control. Thus, the purpose of the present study was to assess the reliability of assays of feedforward and feedback control in voice versus articulation domains.

METHOD: Twenty-eight participants (14 cisgender women, 12 cisgender men, one transgender man, one transmasculine/nonbinary) who denied any history of speech, hearing, or neurological impairment were measured for responses to predictable versus unexpected auditory feedback perturbations of vocal (fundamental frequency, fo) and articulatory (first formant, F1) acoustic parameters twice, with 3-6 weeks between sessions. Reliability was measured with intraclass correlations.
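A sketch of one common intraclass correlation, ICC(2,1) of Shrout and Fleiss (1979), for an n-participants-by-k-sessions matrix; the abstract does not state which ICC variant the authors used, so this is illustrative only:

    import numpy as np

    def icc_2_1(x):
        # Two-way random effects, absolute agreement, single measurement.
        x = np.asarray(x, dtype=float)
        n, k = x.shape
        grand = x.mean()
        msr = k * np.sum((x.mean(axis=1) - grand) ** 2) / (n - 1)  # subjects
        msc = n * np.sum((x.mean(axis=0) - grand) ** 2) / (k - 1)  # sessions
        sse = np.sum((x - grand) ** 2) - msr * (n - 1) - msc * (k - 1)
        mse = sse / ((n - 1) * (k - 1))
        return (msr - mse) / (msr + (k - 1) * mse + k * (msc - mse) / n)

    # five hypothetical participants measured in two sessions
    print(icc_2_1([[10.1, 9.8], [12.0, 12.4], [8.7, 9.1], [11.5, 11.0], [9.9, 10.2]]))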

RESULTS: Opposite patterns of reliability were observed for fo and F1; fo reflexive responses showed good reliability and fo adaptive responses showed poor reliability, whereas F1 reflexive responses showed poor reliability and F1 adaptive responses showed moderate reliability. However, a criterion-referenced categorical measurement of fo adaptive responses as typical versus atypical showed substantial test-retest agreement.

CONCLUSIONS: Individual responses to some behavioral assays of auditory-motor control of speech should be interpreted with caution, which has implications for several fields of research. Additional research is needed to establish reliable criterion-referenced measures of F1 adaptive responses as well as fo and F1 reflexive responses. Furthermore, the opposite patterns of test-retest reliability observed for voice versus articulation add to growing evidence for differences in underlying neural control mechanisms.}, } @article {pmid37988375, year = {2023}, author = {Zhang, W and Clayards, M}, title = {Contribution of acoustic cues to prominence ratings for four Mandarin vowels.}, journal = {The Journal of the Acoustical Society of America}, volume = {154}, number = {5}, pages = {3364-3373}, doi = {10.1121/10.0022410}, pmid = {37988375}, issn = {1520-8524}, mesh = {*Cues ; Bayes Theorem ; Acoustics ; Speech Acoustics ; *Speech Perception ; Phonetics ; }, abstract = {The acoustic cues for prosodic prominence have been explored extensively, but one open question is to what extent they differ by context. This study investigates the extent to which vowel type affects how acoustic cues are related to prominence ratings provided in a corpus of spoken Mandarin. In the corpus, each syllable was rated as either prominent or non-prominent. We predicted prominence ratings using Bayesian mixed-effect regression models for each of four Mandarin vowels (/a, i, ɤ, u/), using fundamental frequency (F0), intensity, duration, the first and second formants, and tone type as predictors. We compared the role of each cue within and across the four models. We found that, overall, duration was the best predictor of prominence ratings and that formants were the weakest, but the role of each cue differed by vowel. We did not find credible evidence that F0 was relevant for /a/, or that intensity was relevant for /i/. We also found evidence that duration was more important for /ɤ/ than for /i/. The results suggest that vowel type credibly affects prominence ratings, which may reflect differences in the coordination of acoustic cues in prominence marking.}, } @article {pmid37974753, year = {2023}, author = {Jasim, M and Nayana, VG and Nayaka, H and Nayak, PS}, title = {Effect of Adenotonsillectomy on Spectral and Acoustic Characteristics.}, journal = {Indian journal of otolaryngology and head and neck surgery : official publication of the Association of Otolaryngologists of India}, volume = {75}, number = {4}, pages = {3467-3475}, pmid = {37974753}, issn = {2231-3796}, abstract = {Acoustic and perceptual analyses have been extensively used to assess the speech and voice among individuals with voice disorders. These methods provide objective, quantitative, and precise information on the speech and voice characteristics in any given disorder, help in monitoring any recovery, deterioration, or improvement in an individual's speech, and also help differentiate between normal and abnormal speech and voice characteristics. The present study was carried out to investigate changes in spectral characteristics (formant frequency parameters and formant centralization ratios) and voice characteristics (acoustic parameters of voice) in individuals following adenotonsillectomy. A total of 34 participants with a history of adenotonsillar hypertrophy took part in the study. Spectral and acoustic voice parameters were analyzed across three time points: before surgery (T0), and 30 days (T1) and 90 days (T2) after surgery.
Data were analyzed statistically using SPSS software, version 28.0.0.0. Descriptive statistics were used to find the mean and standard deviation. Repeated-measures ANOVA was used to compare the pre- and post-surgical measures of the spectral and acoustic voice parameters. The derived parameter of acoustic vowel space (formant centralization ratio 3) was compared across the three timelines. The results revealed that the acoustic vowel space and formant frequency measures increased significantly from the pre-operative to the post-operative conditions across the three timelines. Significant differences were also obtained for the acoustic parameters across the time points. Adenotonsillectomy has proven to be an effective surgical procedure for treating children with chronic adenotonsillitis. The results obtained have indicated an overall improvement in the spectral and acoustic voice parameters, thereby highlighting the need for adenotonsillectomy at the right time and at the right age.}, } @article {pmid37972580, year = {2024}, author = {Noffs, G and Cobler-Lichter, M and Perera, T and Kolbe, SC and Butzkueven, H and Boonstra, FMC and van der Walt, A and Vogel, AP}, title = {Plug-and-Play Microphones for Recording Speech and Voice with Smart Devices.}, journal = {Folia phoniatrica et logopaedica : official organ of the International Association of Logopedics and Phoniatrics (IALP)}, volume = {76}, number = {4}, pages = {372-385}, pmid = {37972580}, issn = {1421-9972}, mesh = {Humans ; *Speech Acoustics ; Male ; Speech Production Measurement/instrumentation ; Female ; Adult ; Voice Quality ; Equipment Design ; Multiple Sclerosis ; }, abstract = {INTRODUCTION: Smart devices are widely available and capable of quickly recording and uploading speech segments for health-related analysis. The switch from laboratory recordings with professional-grade microphone setups to remote, smart device-based recordings offers immense potential for the scalability of voice assessment. Yet, a growing body of literature points to a wide heterogeneity among acoustic metrics for their robustness to variation in recording devices. The addition of consumer-grade plug-and-play microphones has been proposed as a possible solution. The aim of our study was to assess whether the addition of consumer-grade plug-and-play microphones increases the acoustic measurement agreement between ultra-portable devices and a reference microphone.

METHODS: Speech was simultaneously recorded by a reference high-quality microphone commonly used in research and by two configurations with plug-and-play microphones. Twelve speech-acoustic features were calculated using recordings from each microphone to determine the agreement intervals in measurements between microphones. Agreement intervals were then compared to expected deviations in speech in various neurological conditions. Each microphone's response to speech and to silence was characterized through acoustic analysis to explore possible reasons for differences in acoustic measurements between microphones. The statistical differentiation of two groups, neurotypical and people with multiple sclerosis, using metrics from each tested microphone was compared to that of the reference microphone.
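The abstract does not specify how the agreement intervals were computed; a common choice for paired measurements of the same quantity is Bland-Altman 95% limits of agreement, sketched here with invented paired f0 values:

    import numpy as np

    def limits_of_agreement(reference, test):
        # Bland-Altman: bias +/- 1.96 SD of the pairwise differences.
        d = np.asarray(test, dtype=float) - np.asarray(reference, dtype=float)
        bias, sd = d.mean(), d.std(ddof=1)
        return bias - 1.96 * sd, bias + 1.96 * sd

    print(limits_of_agreement([118.2, 201.5, 95.7, 140.0], [118.3, 201.4, 95.8, 140.1]))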

RESULTS: The two consumer-grade plug-and-play microphones favored high frequencies (mean center of gravity difference ≥ +175.3 Hz) and recorded more noise (mean difference in signal-to-noise ratio ≤ -4.2 dB) compared with the reference microphone. Between consumer-grade microphones, differences in relative noise were closely related to the distance between the microphone and the speaker's mouth. Agreement intervals between the reference and consumer-grade microphones remained under disease-expected deviations only for fundamental frequency (f0, agreement interval ≤0.06 Hz), f0 instability (f0 CoV, agreement interval ≤0.05%), and tracking of second formant movement (agreement interval ≤1.4 Hz/ms). Agreement between microphones was poor for other metrics, particularly for fine timing metrics (mean pause length and pause length variability for various tasks). The statistical difference between the two groups of speakers was smaller with the plug-and-play than with the reference microphone.

CONCLUSION: Measurement of f0 and F2 slope was robust to variation in recording equipment, while other acoustic metrics were not. Thus, the tested plug-and-play microphones should not be used interchangeably with professional-grade microphones for speech analysis. Plug-and-play microphones may assist in equipment standardization within speech studies, including remote or self-recording, possibly with small loss in accuracy and statistical power as observed in the current study.}, } @article {pmid37944057, year = {2023}, author = {Ribas-Prats, T and Cordero, G and Lip-Sosa, DL and Arenillas-Alcón, S and Costa-Faidella, J and Gómez-Roig, MD and Escera, C}, title = {Developmental Trajectory of the Frequency-Following Response During the First 6 Months of Life.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {12}, pages = {4785-4800}, doi = {10.1044/2023_JSLHR-23-00104}, pmid = {37944057}, issn = {1558-9102}, mesh = {Infant, Newborn ; Infant ; Humans ; *Speech Perception/physiology ; Language Development ; Phonetics ; Electroencephalography ; }, abstract = {PURPOSE: The aim of the present study is to characterize the maturational changes during the first 6 months of life in the neural encoding of two speech sound features relevant for early language acquisition: the stimulus fundamental frequency (fo), related to stimulus pitch, and the vowel formant composition, particularly F1. The frequency-following response (FFR) was used as a snapshot into the neural encoding of these two stimulus attributes.

METHOD: FFRs to a consonant-vowel stimulus /da/ were retrieved from electroencephalographic recordings in a sample of 80 healthy infants (45 at birth and 35 at the age of 1 month). Thirty-two infants (16 recorded at birth and 16 recorded at 1 month) returned for a second recording at 6 months of age.

RESULTS: Stimulus fo and F1 encoding showed improvements from birth to 6 months of age. Most remarkably, a significant improvement in the F1 neural encoding was observed during the first month of life.

CONCLUSION: Our results highlight the rapid and sustained maturation of the basic neural machinery necessary for the phoneme discrimination ability during the first 6 months of age.}, } @article {pmid37943390, year = {2024}, author = {Mračková, M and Mareček, R and Mekyska, J and Košťálová, M and Rektorová, I}, title = {Levodopa may modulate specific speech impairment in Parkinson's disease: an fMRI study.}, journal = {Journal of neural transmission (Vienna, Austria : 1996)}, volume = {131}, number = {2}, pages = {181-187}, pmid = {37943390}, issn = {1435-1463}, support = {LX22NPO5107 (MEYS): Financed by EU-Next Generation EU//Ministerstvo Školství, Mládeže a Tělovýchovy/ ; }, mesh = {Humans ; *Levodopa/adverse effects ; *Parkinson Disease/complications/diagnostic imaging/drug therapy ; Speech/physiology ; Magnetic Resonance Imaging/methods ; Quality of Life ; Speech Disorders/diagnostic imaging/etiology ; Dysarthria/etiology/complications ; Antiparkinson Agents/adverse effects ; }, abstract = {Hypokinetic dysarthria (HD) is a difficult-to-treat symptom affecting quality of life in patients with Parkinson's disease (PD). Levodopa may partially alleviate some symptoms of HD in PD, but the neural correlates of these effects are not fully understood. The aim of our study was to identify neural mechanisms by which levodopa affects articulation and prosody in patients with PD. Altogether 20 PD patients participated in a task fMRI study (overt sentence reading). Using a single dose of levodopa after an overnight withdrawal of dopaminergic medication, levodopa-induced BOLD signal changes within the articulatory pathway (in regions of interest; ROIs) were studied. We also correlated levodopa-induced BOLD signal changes with the changes in acoustic parameters of speech. We observed no significant changes in acoustic parameters due to acute levodopa administration. After levodopa administration as compared to the OFF dopaminergic condition, patients showed task-induced BOLD signal decreases in the left ventral thalamus (p = 0.0033). The changes in thalamic activation were associated with changes in pitch variation (R = 0.67, p = 0.006), while the changes in caudate nucleus activation were related to changes in the second formant variability which evaluates precise articulation (R = 0.70, p = 0.003). The results are in line with the notion that levodopa does not have a major impact on HD in PD, but it may induce neural changes within the basal ganglia circuitries that are related to changes in speech prosody and articulation.}, } @article {pmid37940420, year = {2023}, author = {Liu, W and Wang, Y and Liang, C}, title = {Formant and Voice Source Characteristics of Vowels in Chinese National Singing and Bel Canto. A Pilot Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.10.016}, pmid = {37940420}, issn = {1873-4588}, abstract = {BACKGROUND: There have been numerous reports on the acoustic characteristics of singers' vowel articulation and phonation, and these studies cover many phonetic dimensions, such as fundamental frequency (F0), intensity, formant frequency, and voice quality.

METHOD: Taking the three representative vowels (/a/, /i/, /u/) in Chinese National Singing and Bel Canto as its focus, the present study investigates the differences and associations in vowel articulation and phonation between the two singing genres using acoustic measures such as F0, formant frequency, and the long-term average spectrum (LTAS).
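For reference, an LTAS of the kind used here can be computed as a long-term Welch average of the power spectrum. A minimal sketch (the `samples` and `sr` arguments are assumed to come from a loaded recording; `band_hz` sets the analysis bandwidth):

    import numpy as np
    from scipy.signal import welch

    def ltas_db(samples, sr, band_hz=100):
        # Average power spectral density over the whole recording, in dB
        # relative to the strongest band; a singer's formant shows up as
        # a peak near 2.5-3.5 kHz.
        freqs, psd = welch(samples, fs=sr, nperseg=int(sr / band_hz))
        db = 10 * np.log10(psd + 1e-12)
        return freqs, db - db.max()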

RESULTS: The relationship between F0 and the formants indicates that F1 is proportional to F0, with the female showing significant variation in the vowel /a/. Compared with the male, the formant structure of the female's singing voice differs significantly from that of her speech voice. Regarding the relationship between intensity and the formants, LTAS shows that the Chinese National Singing tenor and the Bel Canto baritone have the singer's formant cluster when singing vowels, while the two sopranos do not.

CONCLUSIONS: Systematic changes of formant frequencies with the voice source were observed. (i) F1 of the female vowel /a/ has undergone a significant tuning change in the register transition, reflecting the characteristics of singing genres. (ii) Female singers utilize the intrinsic pitch of vowels when adopting the register transition strategy. This finding may facilitate understanding of the theory of intrinsic vowel pitch and help revise Sundberg's hypothesis that F1 rises with F0. A non-linear relationship exists between F1 and F0, which adds to the non-linear interaction of the formant and vocal source. (iii) The singer's formant is affected by voice classification, gender, and singing genres.}, } @article {pmid37935372, year = {2023}, author = {Keller, PE and Lee, J and König, R and Novembre, G}, title = {Sex-related communicative functions of voice spectral energy in human chorusing.}, journal = {Biology letters}, volume = {19}, number = {11}, pages = {20230326}, pmid = {37935372}, issn = {1744-957X}, mesh = {Humans ; Male ; Female ; *Voice ; *Music ; Acoustics ; Social Behavior ; }, abstract = {Music is a human communicative art whose evolutionary origins may lie in capacities that support cooperation and/or competition. A mixed account favouring simultaneous cooperation and competition draws on analogous interactive displays produced by collectively signalling non-human animals (e.g. crickets and frogs). In these displays, rhythmically coordinated calls serve as a beacon whereby groups of males 'cooperatively' attract potential female mates, while the likelihood of each male competitively attracting an actual mate depends on the precedence of his signal. Human behaviour consistent with the mixed account was previously observed in a renowned boys choir, where the basses (the oldest boys with the deepest voices) boosted their acoustic prominence by increasing energy in a high-frequency band of the vocal spectrum when girls were in an otherwise male audience. The current study tested female and male sensitivity and preferences for this subtle vocal modulation in online listening tasks. Results indicate that while female and male listeners are similarly sensitive to enhanced high-spectral energy elicited by the presence of girls in the audience, only female listeners exhibit a reliable preference for it. Findings suggest that human chorusing is a flexible form of social communicative behaviour that allows simultaneous group cohesion and sexually motivated competition.}, } @article {pmid37925330, year = {2023}, author = {Baker, CP and Brockmann-Bauser, M and Purdy, SC and Rakena, TO}, title = {High and Wide: An In Silico Investigation of Frequency, Intensity, and Vibrato Effects on Widely Applied Acoustic Voice Perturbation and Noise Measures.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.10.007}, pmid = {37925330}, issn = {1873-4588}, abstract = {OBJECTIVES: This in silico study explored the effects of a wide range of fundamental frequency (fo), source-spectrum tilt (SST), and vibrato extent (VE) on commonly used frequency and amplitude perturbation and noise measures.

METHOD: Using 53 synthesized tones produced in Madde, the effects of stepwise increases in fo, intensity (modeled by decreasing SST), and VE on the Praat parameters jitter % (local), relative average perturbation (RAP) %, shimmer % (local), amplitude perturbation quotient 3 (APQ3) %, and harmonics-to-noise ratio (HNR) dB were investigated. A secondary experiment was conducted to determine whether any fo effects on jitter, RAP, shimmer, APQ3, and HNR were stable. A total of 10 sinewaves were synthesized in Sopran from 100 to 1000 Hz using formant frequencies for /a/-, /i/-, and /u/-like vowels, respectively. All effects were statistically assessed with Kendall's tau-b and partial correlation.
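Praat's jitter (local) % and shimmer (local) % have simple definitions that are easy to reproduce for synthetic tones like these: the mean absolute difference between consecutive periods (or per-period peak amplitudes) divided by the mean, times 100. A sketch with a simulated, slightly jittered ~200 Hz voice:

    import numpy as np

    def local_perturbation_percent(values):
        # jitter (local) % when `values` are period durations;
        # shimmer (local) % when `values` are per-period peak amplitudes.
        v = np.asarray(values, dtype=float)
        return 100.0 * np.mean(np.abs(np.diff(v))) / v.mean()

    rng = np.random.default_rng(1)
    periods = 1.0 / (200.0 + rng.normal(0.0, 1.0, 200))  # jittered 200 Hz periods
    print(local_perturbation_percent(periods))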

RESULTS: Increasing fo resulted in an overall increase in jitter, RAP, shimmer, and APQ3 values (P < 0.01). Oscillations of the data across the explored fo range were observed in all measurement outputs. In the Sopran tests, the oscillatory pattern seen in the Madde fo condition remained and showed differences between vowel conditions. Increasing intensity (decreasing SST) led to reduced pitch and amplitude perturbation and HNR (P < 0.05). Increasing VE led to lower HNR and an almost linear increase of all other measures (P < 0.05).

CONCLUSION: These novel data offer a controlled demonstration for the behavior of jitter (local) %, RAP %, shimmer (local) %, APQ3 %, and HNR (dB) when varying fo, SST, and VE in synthesized tones. Since humans will vary in all of these aspects in spoken language and vowel phonation, researchers should take potential resonance-harmonics type effects into account when comparing intersubject or preintervention and postintervention data using these measures.}, } @article {pmid37906609, year = {2023}, author = {Song, J and Kim, M and Park, J}, title = {Acoustic correlates of perceived personality from Korean utterances in a formal communicative setting.}, journal = {PloS one}, volume = {18}, number = {10}, pages = {e0293222}, pmid = {37906609}, issn = {1932-6203}, mesh = {Humans ; Female ; *Speech ; *Voice ; Acoustics ; Personality ; Language ; }, abstract = {The aim of the present study was to find acoustic correlates of perceived personality from the speech produced in a formal communicative setting-that of Korean customer service employees in particular. This work extended previous research on voice personality impressions to a different sociocultural and linguistic context in which speakers are expected to speak politely in a formal register. To use naturally produced speech rather than read speech, we devised a new method that successfully elicited spontaneous speech from speakers who were role-playing as customer service employees, while controlling for the words and sentence structures they used. We then examined a wide range of acoustic properties in the utterances, including voice quality and global acoustic and segmental properties using Principal Component Analysis. Subjects of the personality rating task listened to the utterances and rated perceived personality in terms of the Big-Five personality traits. While replicating some previous findings, we discovered several acoustic variables that exclusively accounted for the personality judgments of female speakers; a more modal voice quality increased perceived conscientiousness and neuroticism, and less dispersed formants reflecting a larger body size increased the perceived levels of extraversion and openness. These biases in personality perception likely reflect gender and occupation-related stereotypes that exist in South Korea. Our findings can also serve as a basis for developing and evaluating synthetic speech for Voice Assistant applications in future studies.}, } @article {pmid37905994, year = {2024}, author = {Ealer, C and Niemczak, CE and Nicol, T and Magohe, A and Bonacina, S and Zhang, Z and Rieke, C and Leigh, S and Kobrina, A and Lichtenstein, J and Massawe, ER and Kraus, N and Buckey, JC}, title = {Auditory neural processing in children living with HIV uncovers underlying central nervous system dysfunction.}, journal = {AIDS (London, England)}, volume = {38}, number = {3}, pages = {289-298}, pmid = {37905994}, issn = {1473-5571}, support = {R01 HD095277/HD/NICHD NIH HHS/United States ; }, mesh = {Child ; Humans ; Cohort Studies ; Cross-Sectional Studies ; *HIV Infections/complications ; Acoustic Stimulation ; Tanzania ; Central Nervous System ; }, abstract = {OBJECTIVE: Central nervous system (CNS) damage from HIV infection or treatment can lead to developmental delays and poor educational outcomes in children living with HIV (CLWH). Early markers of central nervous system dysfunction are needed to target interventions and prevent life-long disability. 
The frequency following response (FFR) is an auditory electrophysiology test that can reflect the health of the central nervous system. In this study, we explore whether the FFR reveals auditory central nervous system dysfunction in CLWH.

STUDY DESIGN: Cross-sectional analysis of an ongoing cohort study. Data were from the child's first visit in the study.

SETTING: The infectious disease center in Dar es Salaam, Tanzania.

METHODS: We collected the FFR from 151 CLWH and 151 HIV-negative children. To evoke the FFR, three speech syllables (/da/, /ba/, /ga/) were played monaurally to the child's right ear. Response measures included neural timing (peak latencies), strength of frequency encoding (fundamental frequency and first formant amplitude), encoding consistency (inter-response consistency), and encoding precision (stimulus-to-response correlation).
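Of the FFR measures listed, inter-response consistency is typically computed by correlating the averaged waveforms of two random halves of the single-trial responses; the exact variant used in this study is not specified, so the following is a generic sketch (the `trials` array is assumed):

    import numpy as np

    def inter_response_consistency(trials, n_splits=100, seed=0):
        # trials: (n_trials, n_samples) array of single-trial FFR epochs.
        # Average two random halves, correlate them, repeat, and average.
        trials = np.asarray(trials, dtype=float)
        rng = np.random.default_rng(seed)
        half = len(trials) // 2
        rs = []
        for _ in range(n_splits):
            order = rng.permutation(len(trials))
            a = trials[order[:half]].mean(axis=0)
            b = trials[order[half:]].mean(axis=0)
            rs.append(np.corrcoef(a, b)[0, 1])
        return float(np.mean(rs))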

RESULTS: CLWH showed smaller first formant amplitudes (P < 0.0001), weaker inter-response consistencies (P < 0.0001), and smaller stimulus-to-response correlations (P < 0.0001) than FFRs from HIV-negative children. These findings generalized across the three speech stimuli with moderately strong effect sizes (partial η² ranged from 0.061 to 0.094).

CONCLUSION: The FFR shows auditory central nervous system dysfunction in CLWH. Neural encoding of auditory stimuli was less robust, more variable, and less accurate. As the FFR is a passive and objective test, it may offer an effective way to assess and detect central nervous system function in CLWH.}, } @article {pmid37900335, year = {2023}, author = {Mutlu, A and Celik, S and Kilic, MA}, title = {Effects of Personal Protective Equipment on Speech Acoustics.}, journal = {Sisli Etfal Hastanesi tip bulteni}, volume = {57}, number = {3}, pages = {434-439}, pmid = {37900335}, issn = {1302-7123}, abstract = {OBJECTIVES: The transmission of severe acute respiratory syndrome coronavirus-2 occurs primarily through droplets, which highlights the importance of protecting the oral, nasal, and conjunctival mucosas using personal protective equipment (PPE). The use of PPE can lead to communication difficulties between healthcare workers and patients. This study aimed to investigate changes in the acoustic parameters of speech sounds when different types of PPE are used.

METHODS: A cross-sectional study was conducted, enrolling 18 healthy male and female participants. They were instructed to produce a sustained [ɑː] vowel for at least 3 s to estimate voice quality. In addition, all Turkish vowels were produced for a minimum of 200 ms. Finally, three Turkish fricative consonants ([f], [s], and [ʃ]) were produced in a consonant/vowel/consonant format with different vowel contexts within a carrier sentence. Recordings were repeated under the following conditions: no PPE, surgical mask, N99 mask, face shield, surgical mask + face shield, and N99 mask + face shield. All recordings were subjected to analysis.
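The abstract does not name the analysis software; a common way to estimate the vowel formants reported below is LPC root-solving, sketched here (the file name is hypothetical, and a bandwidth criterion is often added to reject spurious poles):

    import numpy as np
    import librosa

    def lpc_formants(y, sr, order=12, fmax=5000.0):
        # Fit an LPC polynomial to the vowel and convert the angles of its
        # complex roots (poles) to frequencies in Hz.
        a = librosa.lpc(y.astype(float), order=order)
        roots = [r for r in np.roots(a) if np.imag(r) > 0]
        freqs = sorted(np.angle(r) * sr / (2 * np.pi) for r in roots)
        return [f for f in freqs if 90.0 < f < fmax]

    # y, sr = librosa.load("sustained_a.wav", sr=16000)  # hypothetical recording
    # print(lpc_formants(y, sr))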

RESULTS: Frequency perturbation parameters did not show significant differences. However, in males, differences were significant for all vowels except [u] in the first formant (F1), all except [ɔ] and [u] in the second formant (F2), all except [ɛ] and [ɔ] in the third formant (F3), and only [i] in the fourth formant (F4). In females, differences were significant for all vowels except [i] in F1, all except [u] in F2, all vowels in F3, and all except [u] and [ɯ] in F4. Spectral moment values also differed significantly in both groups.
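The spectral moments mentioned here are the centre of gravity, standard deviation, skewness, and kurtosis of the spectrum treated as a distribution over frequency. A sketch using the magnitude spectrum (Praat also offers power-spectrum weighting, so the weighting choice below is an assumption):

    import numpy as np

    def spectral_moments(y, sr):
        mag = np.abs(np.fft.rfft(y * np.hanning(len(y))))
        f = np.fft.rfftfreq(len(y), d=1.0 / sr)
        p = mag / mag.sum()                       # spectrum as a distribution
        cog = np.sum(f * p)                       # 1st moment: centre of gravity
        sd = np.sqrt(np.sum((f - cog) ** 2 * p))  # 2nd: spread
        skew = np.sum((f - cog) ** 3 * p) / sd ** 3
        kurt = np.sum((f - cog) ** 4 * p) / sd ** 4 - 3.0
        return cog, sd, skew, kurt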

CONCLUSION: The use of different types of PPE resulted in variations in speech acoustic features. These findings may be attributed to the filtering effects of PPE on specific frequencies and the potential chamber effect in front of the face. Understanding the impact of PPE on speech acoustics contributes to addressing communication challenges in healthcare settings.}, } @article {pmid37877773, year = {2023}, author = {Steffman, J and Zhang, W}, title = {Vowel perception under prominence: Examining the roles of F0, duration, and distributional information.}, journal = {The Journal of the Acoustical Society of America}, volume = {154}, number = {4}, pages = {2594-2608}, doi = {10.1121/10.0021300}, pmid = {37877773}, issn = {1520-8524}, mesh = {*Cues ; *Language ; Speech ; Perception ; }, abstract = {This study investigates how prosodic prominence mediates the perception of American English vowels, testing the effects of F0 and duration. In Experiment 1, the perception of four vowel continua varying in duration and formants (high: /i-ɪ/, /u-ʊ/; non-high: /ɛ-æ/, /ʌ-ɑ/) was examined under changes in F0-based prominence. Experiment 2 tested if cue usage varies as the distributional informativity of duration as a cue to prominence is manipulated. Both experiments show that duration is a consistent vowel-intrinsic cue. F0-based prominence affected perception of vowels via compensation for peripheralization of prominent vowels in the vowel space. Longer duration and F0-based prominence further enhanced the perception of formant cues. The distributional manipulation in Experiment 2 exerted a minimal impact. Findings suggest that vowel perception is mediated by prominence in a height-dependent manner which reflects patterns in the speech production literature. Further, duration simultaneously serves as an intrinsic cue and serves a prominence-related function in enhancing perception of formant cues.}, } @article {pmid37873157, year = {2023}, author = {Wang, H and Ali, Y and Max, L}, title = {Perceptual formant discrimination during speech movement planning.}, journal = {bioRxiv : the preprint server for biology}, volume = {}, number = {}, pages = {}, pmid = {37873157}, issn = {2692-8205}, support = {R01 DC017444/DC/NIDCD NIH HHS/United States ; R01 DC020162/DC/NIDCD NIH HHS/United States ; }, abstract = {Evoked potential studies have shown that speech planning modulates auditory cortical responses. The phenomenon's functional relevance is unknown. We tested whether, during this time window of cortical auditory modulation, there is an effect on speakers' perceptual sensitivity for vowel formant discrimination. Participants made same/different judgments for pairs of stimuli consisting of a pre-recorded, self-produced vowel and a formant-shifted version of the same production. Stimuli were presented prior to a "go" signal for speaking, prior to passive listening, and during silent reading. The formant discrimination stimulus /uh/ was tested with a congruent productions list (words with /uh/) and an incongruent productions list (words without /uh/). Logistic curves were fitted to participants' responses, and the just-noticeable difference (JND) served as a measure of discrimination sensitivity. We found a statistically significant effect of condition (worst discrimination before speaking) with no congruency effect. Post-hoc pairwise comparisons revealed that JND was significantly greater before speaking than during silent reading.
Thus, formant discrimination sensitivity was reduced during speech planning regardless of the congruence between discrimination stimulus and predicted acoustic consequences of the planned speech movements. This finding may inform ongoing efforts to determine the functional relevance of the previously reported modulation of auditory processing during speech planning.}, } @article {pmid37850867, year = {2023}, author = {Miller, HE and Kearney, E and Nieto-Castañón, A and Falsini, R and Abur, D and Acosta, A and Chao, SC and Dahl, KL and Franken, M and Heller Murray, ES and Mollaei, F and Niziolek, CA and Parrell, B and Perrachione, T and Smith, DJ and Stepp, CE and Tomassi, N and Guenther, FH}, title = {Do Not Cut Off Your Tail: A Mega-Analysis of Responses to Auditory Perturbation Experiments.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {11}, pages = {4315-4331}, pmid = {37850867}, issn = {1558-9102}, support = {R21 DC017563/DC/NIDCD NIH HHS/United States ; P50 DC015446/DC/NIDCD NIH HHS/United States ; R01 DC007683/DC/NIDCD NIH HHS/United States ; R01 DC011277/DC/NIDCD NIH HHS/United States ; R00 DC014520/DC/NIDCD NIH HHS/United States ; R01 DC002852/DC/NIDCD NIH HHS/United States ; R01 DC015570/DC/NIDCD NIH HHS/United States ; F31 DC019032/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; R01 DC016270/DC/NIDCD NIH HHS/United States ; T90 DA032484/DA/NIDA NIH HHS/United States ; F31 DC016197/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; F31 DC020352/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Aged ; *Speech/physiology ; *Parkinson Disease ; Feedback, Sensory/physiology ; }, abstract = {PURPOSE: The practice of removing "following" responses from speech perturbation analyses is increasingly common, despite no clear evidence as to whether these responses represent a unique response type. This study aimed to determine if the distribution of responses to auditory perturbation paradigms represents a bimodal distribution, consisting of two distinct response types, or a unimodal distribution.

METHOD: This mega-analysis pooled data from 22 previous studies to examine the distribution and magnitude of responses to auditory perturbations across four tasks: adaptive pitch, adaptive formant, reflexive pitch, and reflexive formant. Data included at least 150 unique participants for each task, with studies comprising younger adult, older adult, and Parkinson's disease populations. Silverman's unimodality test followed by a smoothed bootstrap resampling technique was performed for each task to evaluate the number of modes in each distribution. Wilcoxon signed-rank tests were also performed for each distribution to confirm significant compensation in response to the perturbation.
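Silverman's test asks how much a Gaussian kernel density estimate must be smoothed before the sample looks unimodal; that critical bandwidth is then compared against bootstrap replicates. A compact sketch of the critical-bandwidth search only (the smoothed bootstrap step is omitted, and the search bounds are assumptions):

    import numpy as np
    from scipy.stats import gaussian_kde

    def n_modes(data, bw):
        # Count local maxima of a KDE with absolute kernel bandwidth bw.
        grid = np.linspace(data.min() - 3 * bw, data.max() + 3 * bw, 512)
        y = gaussian_kde(data, bw_method=bw / data.std(ddof=1))(grid)
        return int(np.sum((y[1:-1] > y[:-2]) & (y[1:-1] > y[2:])))

    def critical_bandwidth(data, tol=1e-4):
        # Smallest bandwidth at which the KDE is unimodal (binary search;
        # the mode count is monotone in bw for Gaussian kernels).
        data = np.asarray(data, dtype=float)
        lo, hi = tol, 4.0 * data.std(ddof=1)
        while hi - lo > tol:
            mid = 0.5 * (lo + hi)
            lo, hi = (mid, hi) if n_modes(data, mid) > 1 else (lo, mid)
        return hi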

RESULTS: Modality analyses were not significant (p > .05) for any group or task, indicating unimodal distributions. Our analyses also confirmed compensatory reflexive responses to pitch and formant perturbations across all groups, as well as adaptive responses to sustained formant perturbations. However, analyses of sustained pitch perturbations revealed evidence of adaptation only in studies with younger adults.

CONCLUSION: The demonstration of a clear unimodal distribution across all tasks suggests that following responses do not represent a distinct response pattern, but rather the tail of a unimodal distribution.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.24282676.}, } @article {pmid37845148, year = {2023}, author = {Chu, M and Wang, J and Fan, Z and Yang, M and Xu, C and Ma, Y and Tao, Z and Wu, D}, title = {A Multidomain Generative Adversarial Network for Hoarse-to-Normal Voice Conversion.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.08.027}, pmid = {37845148}, issn = {1873-4588}, abstract = {Hoarse voice affects the efficiency of communication between people. However, surgical treatment may result in patients with poorer voice quality, and voice repair techniques can only repair vowels. In this paper, we propose a novel multidomain generative adversarial voice conversion method to achieve hoarse-to-normal voice conversion and personalize voices for patients with hoarseness. The proposed method aims to improve the speech quality of hoarse voices through a multidomain generative adversarial network. The proposed method is evaluated with subjective and objective metrics. According to the findings of the spectrum analysis, the suggested method converts hoarse voice formants more effectively than variational auto-encoder (VAE), Auto-VC (voice conversion), StarGAN-VC (Generative Adversarial Network Voice Conversion), and CycleVAE. For the word error rate, the suggested method obtains absolute gains of 35.62, 37.97, 45.42, and 50.05 compared to CycleVAE, StarGAN-VC, Auto-VC, and VAE, respectively. The suggested method outperforms CycleVAE, VAE, StarGAN-VC, and Auto-VC in terms of naturalness by 42.49%, 51.60%, 69.37%, and 77.54%, respectively. The suggested method outperforms VAE, CycleVAE, StarGAN-VC, and Auto-VC in terms of intelligibility, with absolute gains of 0.87, 0.93, 1.08, and 1.13, respectively. In terms of content similarity, the proposed method obtains 43.48%, 75.52%, 76.21%, and 108.62% improvements compared to CycleVAE, StarGAN-VC, Auto-VC, and VAE, respectively. ABX results show that the suggested method can personalize the voice for patients with hoarseness. This study demonstrates the feasibility of voice conversion methods in improving the speech quality of hoarse voices.}, } @article {pmid37838586, year = {2023}, author = {Santos, SS and Christmann, MK and Cielo, CA}, title = {Spectrographic Vocal Characteristics in Female Teachers: Finger Kazoo Intensive Short-term Vocal Therapy.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.08.023}, pmid = {37838586}, issn = {1873-4588}, abstract = {OBJECTIVE: To verify the results of intensive short-term vocal therapy using the Finger Kazoo technique on the spectrographic vocal measurements of teachers.

METHODS: Controlled and randomized trial. Spectrographic vocal assessment was performed by judges before and after intensive short-term vocal therapy with Finger Kazoo. The sample was composed of 41 female teachers. There were two study groups (with vocal nodules and without structural alteration of the vocal folds) and the respective control groups. For the statistical analysis of the data, nonparametric tests were used (Mann-Whitney test and Wilcoxon test).

RESULTS: After intensive short-term vocal therapy with Finger Kazoo, spectral parameters of the voice improved: cleaner tracing (color intensity and regularity), greater definition of formants and harmonics, increased replacement of noise by harmonics, and a greater number of harmonics, mainly in the group without structural alterations of the vocal folds.

CONCLUSION: There was an improvement in the spectrographic vocal parameters, showing greater stability, quality, and projection of the emission, especially in female teachers without structural alterations of the vocal folds.}, } @article {pmid37831677, year = {2023}, author = {Kim, JA and Jang, H and Choi, Y and Min, YG and Hong, YH and Sung, JJ and Choi, SJ}, title = {Subclinical articulatory changes of vowel parameters in Korean amyotrophic lateral sclerosis patients with perceptually normal voices.}, journal = {PloS one}, volume = {18}, number = {10}, pages = {e0292460}, pmid = {37831677}, issn = {1932-6203}, mesh = {Humans ; *Dysarthria/diagnosis/etiology ; *Amyotrophic Lateral Sclerosis ; Speech Intelligibility ; Phonetics ; Republic of Korea ; Speech Acoustics ; }, abstract = {The available quantitative methods for evaluating bulbar dysfunction in patients with amyotrophic lateral sclerosis (ALS) are limited. We aimed to characterize vowel properties in Korean ALS patients, investigate associations between vowel parameters and clinical features of ALS, and analyze subclinical articulatory changes of vowel parameters in those with perceptually normal voices. Forty-three patients with ALS (27 with dysarthria and 16 without dysarthria) and 20 healthy controls were prospectively enrolled in the study. Dysarthria was assessed using the ALS Functional Rating Scale-Revised (ALSFRS-R) speech subscores, with any loss of points (from the maximum of 4) indicating the presence of dysarthria. The structured speech samples were recorded and analyzed using Praat software. For three corner vowels (/a/, /i/, and /u/), data on the vowel duration, fundamental frequency, frequencies of the first two formants (F1 and F2), harmonics-to-noise ratio, vowel space area (VSA), and vowel articulation index (VAI) were extracted from the speech samples. Corner vowel durations were significantly longer in ALS patients with dysarthria than in healthy controls. The F1 frequency of /a/, F2 frequencies of /i/ and /u/, the VSA, and the VAI showed significant differences between ALS patients with dysarthria and healthy controls. The area under the curve (AUC) for this discrimination was 0.912. The F1 frequency of /a/ and the VSA were the major determinants for differentiating ALS patients who had not yet developed apparent dysarthria from healthy controls (AUC 0.887). In linear regression analyses, as the ALSFRS-R speech subscore decreased, both the VSA and VAI were reduced. In contrast, vowel durations were found to be rather prolonged.
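
An aside on the two vowel metrics used above: the vowel space area is the area of the /a/-/i/-/u/ triangle in the F1-F2 plane (shoelace formula), and the vowel articulation index is the standard ratio of peripheral to centralized formant values, VAI = (F2i + F1a) / (F1i + F1u + F2a + F2u). A minimal Python sketch with illustrative formant means, not data from the study:

def triangular_vsa(a, i, u):
    """Shoelace area of the /a/-/i/-/u/ triangle; each vowel is an (F1, F2) pair in Hz."""
    (f1a, f2a), (f1i, f2i), (f1u, f2u) = a, i, u
    return abs(f1a * (f2i - f2u) + f1i * (f2u - f2a) + f1u * (f2a - f2i)) / 2

def vai(a, i, u):
    """Vowel articulation index: peripheral formants over centralized formants."""
    (f1a, f2a), (f1i, f2i), (f1u, f2u) = a, i, u
    return (f2i + f1a) / (f1i + f1u + f2a + f2u)

corner = {"a": (800, 1300), "i": (300, 2300), "u": (350, 800)}  # hypothetical means (Hz)
print(triangular_vsa(corner["a"], corner["i"], corner["u"]))  # Hz^2; shrinks with vowel centralization
print(vai(corner["a"], corner["i"], corner["u"]))             # ~1 when peripheral, <1 when centralized
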
The analyses of vowel parameters provided a useful metric correlated with disease severity for detecting subclinical bulbar dysfunction in ALS patients.}, } @article {pmid37830332, year = {2024}, author = {Cai, X and Ouyang, M and Yin, Y and Zhang, Q}, title = {Sensorimotor Adaptation to Formant-Shifted Auditory Feedback Is Predicted by Language-Specific Factors in L1 and L2 Speech Production.}, journal = {Language and speech}, volume = {67}, number = {3}, pages = {846-869}, doi = {10.1177/00238309231202503}, pmid = {37830332}, issn = {1756-6053}, mesh = {Humans ; Male ; Female ; *Feedback, Sensory/physiology ; Young Adult ; *Multilingualism ; Adult ; *Adaptation, Physiological ; *Speech/physiology ; Speech Perception/physiology ; Memory, Short-Term/physiology ; Executive Function/physiology ; Speech Acoustics ; Speech Production Measurement ; }, abstract = {Auditory feedback plays an important role in the long-term updating and maintenance of speech motor control; thus, the current study explored the unresolved question of how sensorimotor adaptation is predicted by language-specific and domain-general factors in first-language (L1) and second-language (L2) production. Eighteen English-L1 speakers and 22 English-L2 speakers performed the same sensorimotor adaptation experiments and tasks, which measured language-specific and domain-general abilities. The experiment manipulated the language groups (English-L1 and English-L2) and experimental conditions (baseline, early adaptation, late adaptation, and end). Linear mixed-effects model analyses indicated that auditory acuity was significantly associated with sensorimotor adaptation in L1 and L2 speakers. Analysis of vocal responses showed that L1 speakers exhibited significant sensorimotor adaptation under the early adaptation, late adaptation, and end conditions, whereas L2 speakers exhibited significant sensorimotor adaptation only under the late adaptation condition. Furthermore, the domain-general factors of working memory and executive control were not associated with adaptation/aftereffects in either L1 or L2 production, except for the role of working memory in aftereffects in L2 production. Overall, the study empirically supported the hypothesis that sensorimotor adaptation is predicted by language-specific factors such as auditory acuity and language experience, whereas general cognitive abilities do not play a major role in this process.}, } @article {pmid37827893, year = {2023}, author = {Geng, P and Fan, N and Ling, R and Li, Z and Guo, H and Lu, Q and Chen, X}, title = {Acoustic Characteristics of Mandarin Speech in Male Drug Users.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.08.022}, pmid = {37827893}, issn = {1873-4588}, abstract = {AIM: Drug use/addiction has a profound impact on the physical and mental health of individuals. Previous studies have indicated that drug users may experience speech perception disorders, including speech illusion and challenges in recognizing emotional speech. However, the influence of drugs on speech production, as another crucial aspect of speech communication, has not been thoroughly examined. Therefore, the current study aimed to investigate how drugs affect the acoustic characteristics of speech in Chinese male drug users.

METHOD: Speech recordings were collected from a total of 160 male drug users (including 106 heroin users, 23 ketamine users, and 31 methamphetamine users) and 55 male healthy controls with no history of drug use. Acoustic analysis was conducted on the collected speech data from these groups, and classification analysis was performed using five supervised learning algorithms.

RESULTS: The results demonstrated that drug users exhibited a smaller F0 standard deviation; reduced loudness, cepstral peak prominence, and formant relative energies; higher H1-A3; longer unvoiced segments; and fewer voiced segments per second compared to the control group. The classification analyses yielded good performance in classifying drug users and non-drug users, with an accuracy above 86%. Moreover, the identification of the three groups of drug users achieved an accuracy of approximately 70%. Additionally, the study revealed different effects on speech production among the three types of drugs.
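
A generic sketch of this kind of supervised classification analysis, using scikit-learn; the features and labels below are synthetic stand-ins for the study's acoustic measures, and the learners shown are common choices rather than the five the authors used:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

rng = np.random.default_rng(0)
X = rng.normal(size=(215, 8))     # 215 speakers x 8 acoustic features (e.g., F0 SD, CPP, H1-A3)
y = rng.integers(0, 2, size=215)  # 1 = drug user, 0 = control (synthetic labels)

for clf in (LogisticRegression(max_iter=1000), SVC(), RandomForestClassifier(n_estimators=200)):
    pipe = make_pipeline(StandardScaler(), clf)  # scale features, then classify
    print(type(clf).__name__, cross_val_score(pipe, X, y, cv=5).mean())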

CONCLUSION: The above findings indicate the presence of speech disorders, such as vocal hoarseness, in drug users, thus confirming the assumption that the acoustic characteristics of speech in drug users deviate from the norm. This study not only fills the knowledge gap regarding the effects of drugs on the speech production of Chinese male drug users but also provides a more comprehensive understanding of how drugs impact human behaviors. Furthermore, this research provides theoretical foundations for the detoxification and speech rehabilitation of drug users.}, } @article {pmid37817600, year = {2023}, author = {Favaro, L and Zanoli, A and Ludynia, K and Snyman, A and Carugati, F and Friard, O and Scaglione, FE and Manassero, L and Valazza, A and Mathevon, N and Gamba, M and Reby, D}, title = {Vocal tract shape variation contributes to individual vocal identity in African penguins.}, journal = {Proceedings. Biological sciences}, volume = {290}, number = {2008}, pages = {20231029}, pmid = {37817600}, issn = {1471-2954}, mesh = {Animals ; *Spheniscidae/physiology ; Vocalization, Animal/physiology ; Body Size ; Acoustics ; Communication ; }, abstract = {Variation in formant frequencies has been shown to affect social interactions and sexual competition in a range of avian species. Yet, the anatomical bases of this variation are poorly understood. Here, we investigated the morphological correlates of formant production in the vocal apparatus of African penguins. We modelled the geometry of the supra-syringeal vocal tract of 20 specimens to generate a population of virtual vocal tracts with varying dimensions. We then estimated the acoustic response of these virtual vocal tracts and extracted the centre frequency of the first four predicted formants. We demonstrate that: (i) variation in length and cross-sectional area of vocal tracts strongly affects the formant pattern, (ii) the tracheal region determines most of this variation, and (iii) the skeletal size of penguins does not correlate with the trachea length and consequently has relatively little effect on formants. We conclude that in African penguins, while the variation in vocal tract geometry generates variation in resonant frequencies supporting the discrimination of conspecifics, such variation does not provide information on the emitter's body size. Overall, our findings advance our understanding of the role of formant frequencies in bird vocal communication.}, } @article {pmid37811992, year = {2023}, author = {de Boer, MM and Heeren, WFL}, title = {The language dependency of /m/ in native Dutch and non-native English.}, journal = {The Journal of the Acoustical Society of America}, volume = {154}, number = {4}, pages = {2168-2176}, doi = {10.1121/10.0021288}, pmid = {37811992}, issn = {1520-8524}, mesh = {*Multilingualism ; *Speech Perception ; Language ; Speech ; Phonetics ; }, abstract = {In forensic speaker comparisons, the current practice is to try to avoid comparisons between speech fragments in different languages. However, globalization requires an exploration of individual speech features that may show phonetic consistency across a speaker's languages. We predicted that the bilabial nasal /m/ may be minimally affected by the language spoken due to the involvement of the rigid nasal cavity in combination with a lack of fixed oral articulatory targets.
The results show that, indeed, L1 Dutch speakers (N = 53) had similar nasal formants and formant bandwidths when speaking in their L2 English as in their native language, suggesting language independence of /m/ within speakers. In fact, the acoustics seemed to rely more on the phonetic context than on the language spoken. Nevertheless, caution should still be exercised when sampling across languages whose phoneme inventories and phonotactics show substantial differences.}, } @article {pmid37809163, year = {2023}, author = {Meng, Z and Liu, H and Ma, AC}, title = {Optimizing Voice Recognition Informatic Robots for Effective Communication in Outpatient Settings.}, journal = {Cureus}, volume = {15}, number = {9}, pages = {e44848}, pmid = {37809163}, issn = {2168-8184}, abstract = {Aim/Objective Within the dynamic healthcare technology landscape, this research aims to explore patient inquiries within outpatient clinics, elucidating the interplay between technology and healthcare intricacies. Building upon the shortcomings of an initial intelligent guidance robot implementation, this investigation seeks to enhance informatic robots with voice recognition technology. The objective is to analyze users' vocal patterns, discern age-associated vocal attributes, and facilitate age differentiation through subtle vocal nuances to enhance the efficacy of human-robot communication within outpatient clinical settings. Methods This investigation employs a multi-faceted approach. It leverages voice recognition technology to analyze users' vocal patterns. A diverse dataset of voice samples from various age groups was collected. Acoustic features encompassing pitch, formant frequencies, spectral characteristics, and vocal tract length are extracted from the audio samples. The Mel Filterbank and Mel-Frequency Cepstral Coefficients (MFCCs) are employed for speech and audio processing tasks alongside machine learning algorithms to assess and match vocal patterns to age-related traits. Results The research reveals compelling outcomes. The incorporation of voice recognition technology contributes to a significant improvement in human-robot communication within outpatient clinical settings. Through accurate analysis of vocal patterns and age-related traits, informatic robots can differentiate age through nuanced verbal cues. This augmentation leads to enhanced contextual understanding and tailored responses, significantly advancing the efficiency of patient interactions with the robots. Conclusion Integrating voice recognition technology into informatic robots presents a noteworthy advancement in outpatient clinic settings. By enabling age differentiation through vocal nuances, this augmentation enhances the precision and relevance of responses. The study contributes to the ongoing discourse on the dynamic evolution of healthcare technology, underscoring the complex synergy between technological progression and the intricate realities within healthcare infrastructure.
As healthcare continues to evolve, the seamless integration of voice recognition technology marks a pivotal stride in optimizing human-robot communication and elevating patient care within outpatient settings.}, } @article {pmid37790479, year = {2023}, author = {Mohn, JL and Baese-Berk, MM and Jaramillo, S}, title = {Selectivity to acoustic features of human speech in the auditory cortex of the mouse.}, journal = {bioRxiv : the preprint server for biology}, volume = {}, number = {}, pages = {}, pmid = {37790479}, issn = {2692-8205}, support = {R56 DC015531/DC/NIDCD NIH HHS/United States ; }, abstract = {A better understanding of the neural mechanisms of speech processing can have a major impact on the development of strategies for language learning and in addressing disorders that affect speech comprehension. Technical limitations in research with human subjects hinder a comprehensive exploration of these processes, making animal models essential for advancing the characterization of how neural circuits make speech perception possible. Here, we investigated the mouse as a model organism for studying speech processing and explored whether distinct regions of the mouse auditory cortex are sensitive to specific acoustic features of speech. We found that mice can learn to categorize frequency-shifted human speech sounds based on differences in formant transitions (FT) and voice onset time (VOT). Moreover, neurons across various auditory cortical regions were selective to these speech features, with a higher proportion of speech-selective neurons in the dorso-posterior region. Last, many of these neurons displayed mixed-selectivity for both features, an attribute that was most common in dorsal regions of the auditory cortex. Our results demonstrate that the mouse serves as a valuable model for studying the detailed mechanisms of speech feature encoding and neural plasticity during speech-sound learning.}, } @article {pmid37786950, year = {2024}, author = {Sant'Anna, LIDA and Miranda E Paulo, D and Baião, FCS and Lima, IFP and Vieira, WA and Santos, RS and Schroder, AGD and Zeigelboim, BS and Corrêa, CC and Taveira, KVM and de Araujo, CM}, title = {Can rapid maxillary expansion affect speech sound production in growing patients? A systematic review.}, journal = {Orthodontics & craniofacial research}, volume = {27}, number = {2}, pages = {185-192}, doi = {10.1111/ocr.12716}, pmid = {37786950}, issn = {1601-6343}, support = {//Conselho Nacional de Desenvolvimento Científico e Tecnológico/ ; //Coordenação de Aperfeiçoamento de Pessoal de Nível Superior/ ; //Fundação de Amparo à Pesquisa do Estado de Minas Gerais/ ; }, mesh = {Humans ; *Phonetics ; *Palatal Expansion Technique/adverse effects ; Speech ; Maxilla ; Nasal Cavity ; }, abstract = {Rapid maxillary expansion (RME) may change speech sound parameters due to the enlargement of oral and nasal cavities. This study aimed to systematically review the current evidence on speech changes as a side effect of RME. An electronic search was conducted in nine databases, two of which covered the 'grey literature'. The eligibility criteria included clinical studies assessing orthodontic patients with maxillary transverse deficiency and their relationship with speech alterations, without restriction on publication year or language. Only interventional studies were included. The JBI Critical Appraisal Tool assessed the risk of bias. The initial search provided 4853 studies. Seven articles (n = 200 patients) met the inclusion criteria and were analysed.
The primary source of bias was the absence of a control group in four studies. RME altered speech production by changing vowel fundamental frequency and fricative phoneme formant frequency. Shimmer and jitter rates changed in one and two studies, respectively. Two studies reported speech deterioration during orthodontic treatment, but speech improved after appliance removal. Despite the limited evidence, RME appears to affect speech during and after treatment.}, } @article {pmid37778391, year = {2023}, author = {Grawunder, S and Uomini, N and Samuni, L and Bortolato, T and Girard-Buttoz, C and Wittig, RM and Crockford, C}, title = {Correction: 'Chimpanzee vowel-like sounds and voice quality suggest formant space expansion through the hominoid lineage' (2021), by Grawunder et al.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {378}, number = {1890}, pages = {20230319}, pmid = {37778391}, issn = {1471-2970}, } @article {pmid37769645, year = {2024}, author = {van Brenk, F and Lowit, A and Tjaden, K}, title = {Effects of Speaking Rate on Variability of Second Formant Frequency Transitions in Dysarthria.}, journal = {Folia phoniatrica et logopaedica : official organ of the International Association of Logopedics and Phoniatrics (IALP)}, volume = {76}, number = {3}, pages = {295-308}, pmid = {37769645}, issn = {1421-9972}, support = {R01 DC004689/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Dysarthria/physiopathology/etiology ; Male ; Female ; *Parkinson Disease/complications/physiopathology ; Middle Aged ; *Speech Intelligibility ; Aged ; Speech Acoustics ; Adult ; Speech Production Measurement ; Sound Spectrography ; Ataxia/physiopathology ; Phonetics ; }, abstract = {INTRODUCTION: This study examined the utility of multiple second formant (F2) slope metrics to capture differences in speech production for individuals with dysarthria and healthy controls as a function of speaking rate. In addition, the utility of F2 slope metrics for predicting severity of intelligibility impairment in dysarthria was examined.

METHODS: Twenty-three speakers with Parkinson's disease and mild to moderate hypokinetic dysarthria (HD), 9 speakers with various neurological diseases and mild to severe ataxic or ataxic-spastic dysarthria (AD), and 26 age-matched healthy control speakers (CON) participated in a sentence repetition task. Sentences were produced at habitual, fast, and slow speaking rates. A variety of metrics were derived from the rising F2 transition portion of the diphthong /ai/. To obtain measures of intelligibility for the two clinical speaker groups, 15 undergraduate SLP students participated in a transcription experiment.

RESULTS: Significantly shallower slopes were found for the speakers with HD compared to control speakers. Steeper F2 slopes were associated with increased speaking rate for all groups. Higher variability in F2 slope metrics was found for the speakers with AD compared to the two other speaker groups. For both clinical speaker groups, there was a negative association between intelligibility and F2 slope variability metrics, indicating that lower variability in speech production was associated with higher intelligibility.

DISCUSSION: F2 slope metrics were sensitive to dysarthria presence, dysarthria type, and speaking rate. The current study provided evidence that F2 slope variability measures add value beyond averaged F2 slope measures for predicting the severity of intelligibility impairment in dysarthria.}, } @article {pmid37756574, year = {2023}, author = {Liu, W and Wang, T and Huang, X}, title = {The influences of forward context on stop-consonant perception: The combined effects of contrast and acoustic cue activation?.}, journal = {The Journal of the Acoustical Society of America}, volume = {154}, number = {3}, pages = {1903-1920}, doi = {10.1121/10.0021077}, pmid = {37756574}, issn = {1520-8524}, mesh = {*Cues ; *Speech Perception/physiology ; Phonetics ; Acoustics ; Speech Acoustics ; Acoustic Stimulation ; }, abstract = {The perception of the /da/-/ga/ series, distinguished primarily by the third formant (F3) transition, is affected by many nonspeech and speech sounds. Previous studies mainly investigated the influences of context stimuli with frequency bands located in the F3 region and proposed the account of spectral contrast effects. This study examined the effects of context stimuli with bands not in the F3 region. The results revealed that these non-F3-region stimuli (whether with bands higher or lower than the F3 region) mainly facilitated the identification of /ga/; for example, the stimuli (including frequency-modulated glides, sine-wave tones, filtered sentences, and natural vowels) in the low-frequency band (500-1500 Hz) led to more /ga/ responses than those in the low-F3 region (1500-2500 Hz). It is suggested that in the F3 region, context stimuli may act through spectral contrast effects, while in non-F3 regions, context stimuli might activate the acoustic cues of /g/ and further facilitate the identification of /ga/. The combination of contrast and acoustic cue effects can explain more results concerning the forward context influences on the perception of the /da/-/ga/ series, including the effects of non-F3-region stimuli and the imbalanced influences of context stimuli on /da/ and /ga/ perception.}, } @article {pmid37748969, year = {2023}, author = {Toppo, R and Sinha, S}, title = {The Acoustics of Gender in Indian English: Toward Forensic Profiling in a Multilingual Context.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.07.030}, pmid = {37748969}, issn = {1873-4588}, abstract = {The present study is an acoustic analysis of Indian English, specifically examining the speech patterns and characteristics of three different groups with different native languages. This study investigates fundamental frequency (fo), fo range, fo variance, formant frequencies, and vowel space size in 42 native male and female speakers of Odia, Bangla, and Hindi. Furthermore, it investigated the potential correlation between fundamental frequency and vowel space, examining whether variations in vowel space size could be influenced by gender-specific perceptual factors. The paper emphasizes that in a multilingual context, gender identification can be efficiently correlated with both fo and formant frequencies. To measure a range of acoustic characteristics, speech samples were collected via a recording task. Analysis was done in Praat. The study revealed significant differences between genders for the examined acoustic characteristics.
Results indicate differences in the size of gender-specific variations among the language groups, with females exhibiting more significant differences in fo, formant frequencies, and vowel space than males. The findings show no significant correlation between fo and vowel space area, indicating that other features are responsible for the larger vowel space of females. These findings show significant potential for creating a robust empirical framework for gender profiling that can be utilized in a wide range of forensic linguistic investigations.}, } @article {pmid37736531, year = {2023}, author = {Osiecka, AN and Briefer, EF and Kidawa, D and Wojczulanis-Jakubas, K}, title = {Social calls of the little auk (Alle alle) reflect body size and possibly partnership, but not sex.}, journal = {Royal Society open science}, volume = {10}, number = {9}, pages = {230845}, pmid = {37736531}, issn = {2054-5703}, abstract = {Source-filter theory posits that an individual's size and vocal tract length are reflected in the parameters of their calls. In species that mate assortatively, this could result in vocal similarity. In the context of mate selection, this would mean that animals could listen in to find a partner that sounds, and therefore is, similar to them. We investigated the social calls of the little auk (Alle alle), a highly vocal seabird mating assortatively, using vocalizations produced inside 15 nests by known individuals. Source- and filter-related acoustic parameters were used in linear mixed models testing the possible impact of body size. A principal component analysis followed by a permuted discriminant function analysis tested the effect of sex. Additionally, randomization procedures tested whether partners are more vocally similar than random birds. There was a significant effect of size on the mean fundamental frequency of a simple call, but not on parameters of a multisyllable call with apparent formants. Neither sex nor partnership influenced the calls; there was, however, a tendency to match certain parameters between partners. This indicates that vocal cues are at best weak indicators of size, and other factors likely play a role in mate selection.}, } @article {pmid37730823, year = {2023}, author = {Georgiou, GP}, title = {Comparison of the prediction accuracy of machine learning algorithms in crosslinguistic vowel classification.}, journal = {Scientific reports}, volume = {13}, number = {1}, pages = {15594}, pmid = {37730823}, issn = {2045-2322}, mesh = {Adult ; Humans ; *Algorithms ; *Neural Networks, Computer ; Acoustics ; Discriminant Analysis ; Machine Learning ; }, abstract = {Machine learning algorithms can be used for the prediction of nonnative sound classification based on crosslinguistic acoustic similarity. To date, very few linguistic studies have compared the classification accuracy of different algorithms. This study aims to assess how well machines align with human speech perception by assessing the ability of three machine learning algorithms, namely, linear discriminant analysis (LDA), decision tree (C5.0), and neural network (NNET), to predict the classification of second language (L2) sounds in terms of first language (L1) categories. The models were trained using the first three formants and duration of L1 vowels and fed with the same acoustic features of L2 vowels. To validate their accuracy, adult L2 speakers completed a perceptual classification task.
The results indicated that NNET successfully predicted, for all L2 vowels, the L1 category chosen by the highest proportion of listeners, while LDA and C5.0 missed only one vowel each. Furthermore, NNET exhibited superior accuracy in predicting the full range of above-chance responses, followed closely by LDA. C5.0 did not meet the anticipated performance levels. The findings hold significant implications for advancing both the theoretical and practical frameworks of speech acquisition.}, } @article {pmid37717981, year = {2023}, author = {Zhang, T and Liu, X and Liu, G and Shao, Y}, title = {PVR-AFM: A Pathological Voice Repair System based on Non-linear Structure.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {5}, pages = {648-662}, doi = {10.1016/j.jvoice.2021.05.010}, pmid = {37717981}, issn = {1873-4588}, mesh = {Humans ; Aged ; *Voice ; Speech ; *Voice Disorders/diagnosis ; Algorithms ; Cognition ; }, abstract = {OBJECTIVE: Speech signal processing has become an important technique to ensure that voice interaction systems communicate accurately with the user by improving the clarity or intelligibility of speech signals. However, most existing work focuses only on processing the voices of average speakers and ignores the communication needs of individuals suffering from voice disorders, including voice-related professionals, older people, and smokers. To meet this need, it is essential to design a non-invasive repair system that processes pathological voices.

METHODS: In this paper, we propose a repair system for multiple vowels produced by patients with vocal polyps, such as /a/, /i/, and /u/. We utilize a non-linear model based on an amplitude-modulation (AM) and frequency-modulation (FM) structure to extract the pitch and formants of the pathological voice. To address pitch breaks and instability, we provide a pitch extraction algorithm that ensures pitch stability and avoids pitch-doubling errors caused by the instability of the low-frequency signal. Furthermore, we design a formant reconstruction mechanism that can effectively determine the frequency and bandwidth needed to accomplish formant repair.

RESULTS: Spectral observations and objective indicators show that the system performs well in improving the intelligibility of pathological speech.}, } @article {pmid37701868, year = {2023}, author = {Roland, V and Huet, K and Harmegnies, B and Piccaluga, M and Verhaegen, C and Delvaux, V}, title = {Vowel production: a potential speech biomarker for early detection of dysarthria in Parkinson's disease.}, journal = {Frontiers in psychology}, volume = {14}, number = {}, pages = {1129830}, pmid = {37701868}, issn = {1664-1078}, abstract = {OBJECTIVES: Our aim is to detect early, subclinical speech biomarkers of dysarthria in Parkinson's disease (PD), i.e., systematic atypicalities in speech that remain subtle and are not easily detectable by the clinician, so that the patient is labeled "non-dysarthric." Based on promising exploratory work, we examine here whether vowel articulation, as assessed by three acoustic metrics, can be used as an early indicator of speech difficulties associated with Parkinson's disease.

STUDY DESIGN: This is a prospective case-control study.

METHODS: Sixty-three individuals with PD and 35 without PD (healthy controls, HC) participated in this study. Of the 63 PD patients, 43 had been diagnosed with dysarthria (DPD) and 20 had not (NDPD). Sustained vowels were recorded for each speaker and formant frequencies were measured. The analyses focused on three acoustic metrics: individual vowel triangle areas (tVSA), the vowel articulation index (VAI), and the Phi index.
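
Formant measurement of sustained vowels is commonly scripted through Praat; a minimal sketch using the parselmouth Python bindings (the file name is hypothetical and the tracker settings shown are Praat defaults, not necessarily those used in the study):

import parselmouth

snd = parselmouth.Sound("sustained_a.wav")  # hypothetical recording of a sustained /a/
formants = snd.to_formant_burg(max_number_of_formants=5, maximum_formant=5500)

t_mid = snd.duration / 2  # sample at the vowel midpoint
f1 = formants.get_value_at_time(1, t_mid)
f2 = formants.get_value_at_time(2, t_mid)
print(f"F1 = {f1:.0f} Hz, F2 = {f2:.0f} Hz")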

RESULTS: tVSA was found to be significantly smaller for DPD speakers than for HC. The VAI showed significant differences between these two groups, indicating greater centralization and lower vowel contrasts in the DPD speakers. In addition, DPD and NDPD speakers had lower Phi values, indicating a lower organization of their vowel systems compared to the HC. Results also showed that the VAI was the most efficient metric for distinguishing between DPD and NDPD, whereas the Phi index was the best acoustic metric for discriminating NDPD and HC.

CONCLUSION: This acoustic study identified potential subclinical vowel-related speech biomarkers of dysarthria in speakers with Parkinson's disease who have not been diagnosed with dysarthria.}, } @article {pmid37695295, year = {2023}, author = {Perrine, BL and Scherer, RC}, title = {Using a vertical three-mass computational model of the vocal folds to match human phonation of three adult males.}, journal = {The Journal of the Acoustical Society of America}, volume = {154}, number = {3}, pages = {1505-1525}, pmid = {37695295}, issn = {1520-8524}, support = {R01 DC007640/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Adult ; Male ; *Vocal Cords ; Phonation ; Glottis ; *Larynx ; Computer Simulation ; }, abstract = {Computer models of phonation are used to study various parameters that are difficult to control, measure, and observe in human subjects. Imitating human phonation by varying the prephonatory conditions of computer models offers insight into the variations that occur across human phonatory production. In the present study, a vertical three-mass computer model of phonation [Perrine, Scherer, Fulcher, and Zhai (2020). J. Acoust. Soc. Am. 147, 1727-1737], driven by empirical pressures from a physical model of the vocal folds (model M5), with a vocal tract following the design of Ishizaka and Flanagan [(1972). Bell Sys. Tech. J. 51, 1233-1268] was used to match prolonged vowels produced by three male subjects using various pitch and loudness levels. The prephonatory conditions of tissue mass and tension, subglottal pressure, glottal diameter and angle, posterior glottal gap, false vocal fold gap, and vocal tract cross-sectional areas were varied in the model to match the model output with the fundamental frequency, alternating current airflow, direct current airflow, skewing quotient, open quotient, maximum flow negative derivative, and the first three formant frequencies from the human production. Parameters were matched between the model and human subjects with an average overall percent mismatch of 4.40% (standard deviation = 6.75%), suggesting a reasonable ability of the simple low dimensional model to mimic these variables.}, } @article {pmid37650429, year = {2023}, author = {Steffman, J}, title = {Vowel-internal cues to vowel quality and prominence in speech perception.}, journal = {Phonetica}, volume = {80}, number = {5}, pages = {329-356}, pmid = {37650429}, issn = {1423-0321}, mesh = {Humans ; *Cues ; *Speech Perception ; Language ; Speech ; Phonetics ; Speech Acoustics ; }, abstract = {This study examines how variation in F0 and intensity impacts the perception of American English vowels. Both properties vary intrinsically as a function of vowel features in the speech production literature, raising the question of the perceptual impact of each. In addition to considering listeners' interpretation of either cue as an intrinsic property of the vowel, the possible prominence-marking function of each is considered. Two patterns of prominence strengthening in vowels, sonority expansion and hyperarticulation, are tested in light of recent findings that contextual prominence impacts vowel perception in line with these effects (i.e. a prominent vowel is expected by listeners to be realized as if it had undergone prominence strengthening). Across four vowel contrasts with different height and frontness features, listeners categorized phonetic continua with variation in formants, F0 and intensity. 
Results show that variation in level F0 height is interpreted as an intrinsic cue by listeners. Higher F0 cues a higher vowel, following intrinsic F0 effects in the production literature. In comparison, intensity is interpreted as a prominence-lending cue, for which effect directionality is dependent on vowel height. Higher intensity high vowels undergo perceptual re-calibration in line with (acoustic) hyperarticulation, whereas higher intensity non-high vowels undergo perceptual re-calibration in line with sonority expansion.}, } @article {pmid37630210, year = {2023}, author = {Yang, J and Yue, Y and Lv, H and Ren, B and Zhang, Y}, title = {Effect of Adding Intermediate Layers on the Interface Bonding Performance of WC-Co Diamond-Coated Cemented Carbide Tool Materials.}, journal = {Molecules (Basel, Switzerland)}, volume = {28}, number = {16}, pages = {}, pmid = {37630210}, issn = {1420-3049}, support = {ZR2022ME129//Natural Science Foundation of Shandong Province of China/ ; 2021-2//Science and Technology Research-Revealing-list System- special project of QingdaoWest Coast New Area of Shandong province of China/ ; }, abstract = {The interface models of diamond-coated WC-Co cemented carbide (DCCC) were constructed without intermediate layers and with different interface terminals, such as intermediate layers of TiC, TiN, CrN, and SiC. The adhesion work of the interface model was calculated based on the first principle. The results show that the adhesion work of the interface was increased after adding four intermediate layers. Their effect on improving the interface adhesion performance of cemented carbide coated with diamond was ranked in descending order as follows: SiC > CrN > TiC > TiN. The charge density difference and the density of states were further analyzed. After adding the intermediate layer, the charge distribution at the interface junction was changed, and the electron cloud at the interface junction overlapped to form a more stable chemical bond. Additionally, after adding the intermediate layer, the density of states of the atoms at the interface increased in the energy overlapping area. The formant formed between the electronic orbitals enhances the bond strength. Thus, the interface bonding performance of DCCC was enhanced. Among them, the most obvious was the interatomic electron cloud overlapping at the diamond/SiCC-Si/WC-Co interface, its bond length was the shortest (1.62 Å), the energy region forming the resonance peak was the largest (-5-20 eV), and the bonding was the strongest. The interatomic bond length at the diamond/TiNTi/WC-Co interface was the longest (4.11 Å), the energy region forming the resonance peak was the smallest (-5-16 eV), and the bonding was the weakest. Comprehensively considering four kinds of intermediate layers, the best intermediate layer for improving the interface bonding performance of DCCC was SiC, and the worst was TiN.}, } @article {pmid37616075, year = {2023}, author = {Bradshaw, AR and Lametti, DR and Shiller, DM and Jasmin, K and Huang, R and McGettigan, C}, title = {Speech motor adaptation during synchronous and metronome-timed speech.}, journal = {Journal of experimental psychology. 
General}, volume = {152}, number = {12}, pages = {3476-3489}, doi = {10.1037/xge0001459}, pmid = {37616075}, issn = {1939-2222}, support = {//Leverhulme Trust/ ; }, mesh = {Humans ; Speech/physiology ; *Speech Perception/physiology ; *Voice/physiology ; Phonetics ; Learning ; }, abstract = {Sensorimotor integration during speech has been investigated by altering the sound of a speaker's voice in real time; in response, the speaker learns to change their production of speech sounds in order to compensate (adaptation). This line of research has however been predominantly limited to very simple speaking contexts, typically involving (a) repetitive production of single words and (b) production of speech while alone, without the usual exposure to other voices. This study investigated adaptation to a real-time perturbation of the first and second formants during production of sentences either in synchrony with a prerecorded voice (synchronous speech group) or alone (solo speech group). Experiment 1 (n = 30) found no significant difference in the average magnitude of compensatory formant changes between the groups; however, synchronous speech resulted in increased between-individual variability in such formant changes. Participants also showed acoustic-phonetic convergence to the voice they were synchronizing with prior to introduction of the feedback alteration. Furthermore, the extent to which the changes required for convergence agreed with those required for adaptation was positively correlated with the magnitude of subsequent adaptation. Experiment 2 tested an additional group with a metronome-timed speech task (n = 15) and found a similar pattern of increased between-participant variability in formant changes. These findings demonstrate that speech motor adaptation can be measured robustly at the group level during performance of more complex speaking tasks; however, further work is needed to resolve whether self-voice adaptation and other-voice convergence reflect additive or interactive effects during sensorimotor control of speech. (PsycInfo Database Record (c) 2023 APA, all rights reserved).}, } @article {pmid37591234, year = {2023}, author = {Ancel, EE and Smith, ML and Rao, VNV and Munson, B}, title = {Relating Acoustic Measures to Listener Ratings of Children's Productions of Word-Initial /ɹ/ and /w/.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {9}, pages = {3413-3427}, pmid = {37591234}, issn = {1558-9102}, support = {R01 DC002932/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Humans ; Child, Preschool ; Reproducibility of Results ; *Acoustics ; Educational Status ; *Schools ; }, abstract = {PURPOSE: The /ɹ/ productions of young children acquiring American English are highly variable and often inaccurate, with [w] as the most common substitution error. One acoustic indicator of the goodness of children's /ɹ/ productions is the difference between the frequency of the second formant (F2) and the third formant (F3), with a smaller F3-F2 difference being associated with a perceptually more adultlike /ɹ/. This study analyzed the effectiveness of automatically extracted F3-F2 differences in characterizing young children's productions of /ɹ/-/w/ in comparison with manually coded measurements.

METHOD: Automated F3-F2 differences were extracted from productions of a variety of different /ɹ/- and /w/-initial words spoken by 3- to 4-year-old monolingual preschoolers (N = 117; 2,278 tokens in total). These automated measures were compared to phoneme-goodness ratings of the children's productions provided by untrained adult listeners (n = 132) on a visual analog scale, as well as to narrow transcriptions of each production into four categories: [ɹ], [w], and two intermediate categories.
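
Once per-token formant estimates exist, the F3-F2 difference and its relationship to listener ratings reduce to elementary array arithmetic; a small illustrative sketch with invented values (not the study's tokens):

import numpy as np

f2 = np.array([1500.0, 1700.0, 1200.0, 1900.0])  # hypothetical F2 at the segment midpoint (Hz)
f3 = np.array([2900.0, 2100.0, 2950.0, 2200.0])  # hypothetical F3 (Hz)
ratings = np.array([0.2, 0.8, 0.1, 0.9])         # goodness ratings, 0 = /w/-like, 1 = /r/-like

f3_f2 = f3 - f2  # smaller difference ~ more adult-like /r/
r = np.corrcoef(f3_f2, ratings)[0, 1]
print(f"Pearson r between F3-F2 and ratings: {r:.2f}")  # negative if small differences get high ratings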

RESULTS: Data visualizations show a weak relationship of automated F3-F2 differences with both listener ratings and narrow transcriptions. Mixed-effects models suggest the automated F3-F2 difference only modestly predicts listener ratings (R² = .37) and narrow transcriptions (R² = .32).

CONCLUSION: The weak relationship between automated F3-F2 difference and both listener ratings and narrow transcriptions suggests that these automated acoustic measures are of questionable reliability and utility in assessing preschool children's mastery of the /ɹ/-/w/ contrast.}, } @article {pmid37555773, year = {2023}, author = {Stilp, C and Chodroff, E}, title = {"Please say what this word is": Linguistic experience and acoustic context interact in vowel categorization .}, journal = {JASA express letters}, volume = {3}, number = {8}, pages = {}, pmid = {37555773}, issn = {2691-1191}, support = {R01 DC020303/DC/NIDCD NIH HHS/United States ; }, mesh = {*Phonetics ; *Speech Perception ; Acoustics ; Speech Acoustics ; Language ; }, abstract = {Ladefoged and Broadbent [(1957). J. Acoust. Soc. Am. 29(1), 98-104] is a foundational study in speech perception research, demonstrating that acoustic properties of earlier sounds alter perception of subsequent sounds: a context sentence with a lowered first formant (F1) frequency promotes perception of a raised F1 in a target word, and vice versa. The present study replicated the original with U.K. and U.S. listeners. While the direction of the perceptual shift was consistent with the original study, neither sample replicated the large effect sizes. This invites consideration of how linguistic experience relates to the magnitudes of these context effects.}, } @article {pmid37555771, year = {2023}, author = {Tanner, J}, title = {Prosodic and durational influences on the formant dynamics of Japanese vowels.}, journal = {JASA express letters}, volume = {3}, number = {8}, pages = {}, doi = {10.1121/10.0020547}, pmid = {37555771}, issn = {2691-1191}, mesh = {Humans ; Phonetics ; *Speech Acoustics ; *Language ; }, abstract = {The relationship between prosodic structure and segmental realisation is a central question within phonetics. For vowels, this has been typically examined in terms of duration, leaving largely unanswered how prosodic boundaries influence spectral realisation. This study examines the influence of prosodic boundary strength-as well as duration and pauses-on vowel dynamics in spontaneous Japanese. While boundary strength has a marginal effect on dynamics, increased duration and pauses result in greater vowel peripherality and spectral change. These findings highlight the complex relationship between prosodic and segmental structure, and illustrate the importance of multifactorial analysis in corpus research.}, } @article {pmid37547022, year = {2023}, author = {Hilger, A and Cole, J and Larson, C}, title = {Task-dependent pitch auditory feedback control in cerebellar ataxia.}, journal = {Research square}, volume = {}, number = {}, pages = {}, pmid = {37547022}, issn = {2693-5015}, support = {F31 DC017877/DC/NIDCD NIH HHS/United States ; }, abstract = {PURPOSE: The purpose of this study was to investigate how ataxia affects the task-dependent role of pitch auditory feedback control in speech. In previous research, individuals with ataxia produced over-corrected, hypermetric compensatory responses to unexpected pitch and formant frequency perturbations in auditory feedback in sustained vowels and single words (Houde et al., 2019; Li et al., 2019; Parrell et al., 2017). In this study, we investigated whether ataxia would also affect the task-dependent role of the auditory feedback control system, measuring whether pitch-shift responses would be mediated by speech task or semantic focus pattern as they are in neurologically healthy speakers.

METHODS: Twenty-two adults with ataxia and 29 age- and sex-matched control participants produced sustained vowels and sentences with and without corrective focus while their auditory feedback was briefly and unexpectedly perturbed in pitch by +/-200 cents. The magnitude and latency of the reflexive pitch-shift responses were measured as a reflection of auditory feedback control.
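
Perturbations and responses in such paradigms are measured in cents, a log-frequency unit defined by cents = 1200 * log2(f / fref). A minimal sketch of the conversions (the F0 values are illustrative):

import math

def shift_hz(f0: float, cents: float) -> float:
    """Frequency after applying a pitch shift of the given size in cents."""
    return f0 * 2 ** (cents / 1200)

def to_cents(f: float, ref: float) -> float:
    """Express frequency f relative to ref in cents."""
    return 1200 * math.log2(f / ref)

f0 = 220.0                  # a speaker's habitual F0 (illustrative)
heard = shift_hz(f0, -200)  # feedback shifted 200 cents downward, ~196 Hz
response = 224.0            # hypothetical opposing (compensatory) production
print(f"feedback: {heard:.1f} Hz; response: {to_cents(response, f0):+.1f} cents")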

RESULTS: Individuals with ataxia produced larger reflexive pitch-shift responses in both the sustained-vowel and sentence-production tasks than the control participants. Additionally, a differential response magnitude was observed by task and sentence focus pattern for both groups.

CONCLUSION: These findings demonstrate that even though accuracy of auditory feedback control correction is affected by cerebellar damage, as evidenced by the hypermetric responses, the system still retains efficiency in utilizing the task-dependent role of auditory feedback.}, } @article {pmid37541926, year = {2023}, author = {Gao, Y and Feng, Y and Wu, D and Lu, F and He, H and Tian, C}, title = {Effect of Wearing Different Masks on Acoustic, Aerodynamic, and Formant Parameters.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.06.018}, pmid = {37541926}, issn = {1873-4588}, abstract = {OBJECTIVE: This study aimed to investigate the effects of different types of masks on acoustic, aerodynamic, and formant parameters in healthy people.

METHODS: Our study involved 30 healthy participants, 15 of each gender, aged 20-40 years. The tests were conducted under four conditions: without a mask, after wearing a surgical mask, after wearing a head-mounted N95 mask, and after wearing an ear-mounted N95 mask. Voice recording was done with the mask on. The acoustic parameters included mean fundamental frequency (F0), mean intensity, percentage of jitter (local), percentage of shimmer (local), and mean noise-to-harmonics ratio (NHR); the aerodynamic parameter was maximum phonation time (MPT); and the formant parameters were F1 and F2 of the three vowels /a/, /i/, and /u/.
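
Jitter (local), shimmer (local), and harmonicity measures of this kind are typically computed in Praat; a sketch using the parselmouth bindings with Praat's default perturbation thresholds (the file name is hypothetical, and these are not necessarily the authors' settings):

import parselmouth
from parselmouth.praat import call

snd = parselmouth.Sound("vowel_a.wav")  # hypothetical sustained-vowel recording
pp = call(snd, "To PointProcess (periodic, cc)", 75, 500)  # glottal pulses, F0 search range in Hz

jitter_local = call(pp, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
shimmer_local = call([snd, pp], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
hnr = call(snd.to_harmonicity_cc(), "Get mean", 0, 0)

print(f"jitter = {jitter_local:.2%}, shimmer = {shimmer_local:.2%}, HNR = {hnr:.1f} dB")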

RESULTS: The main effect of mask type was significant for MPT, mean F0, mean HNR, /a/F1, /a/F2, and /i/F2. However, the effect sizes and power for /a/F2 and /i/F2 were low. MPT, mean F0, and mean HNR significantly increased and /a/F1 significantly decreased after wearing the head-mounted N95 mask. Mean F0 and mean HNR increased significantly after wearing the ear-mounted N95 mask. No significant changes in any parameter were observed after wearing the surgical mask. When the statistics were computed separately for males and females, the results were similar to those obtained for the pooled sample.

CONCLUSION: After wearing the surgical mask, this study found insignificant changes in mean F0, jitter (local), shimmer (local), mean NHR, mean intensity, MPT, and the vowel formants F1 and F2. This may be due to the looser design of the surgical mask and its relatively small attenuation of sound. N95 masks have a greater effect on vocalization than surgical masks and may cause changes in F0 and HNR. In the present study, no significant changes in jitter and shimmer were observed after wearing any mask. In addition, the significant reduction in /a/F1 after wearing the head-mounted N95 mask may be owing to its strong restriction of jaw mobility. Future studies could additionally examine changes in jaw movement amplitude during mask wearing.}, } @article {pmid37522248, year = {2023}, author = {Rizzi, R and Bidelman, GM}, title = {Duplex perception reveals brainstem auditory representations are modulated by listeners' ongoing percept for speech.}, journal = {Cerebral cortex (New York, N.Y. : 1991)}, volume = {33}, number = {18}, pages = {10076-10086}, pmid = {37522248}, issn = {1460-2199}, support = {R01 DC016267/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Speech ; *Speech Perception/physiology ; Brain Stem/physiology ; Brain/physiology ; Hearing ; Auditory Perception/physiology ; Acoustic Stimulation ; }, abstract = {So-called duplex speech stimuli with perceptually ambiguous spectral cues to one ear and isolated low- versus high-frequency third formant "chirp" to the opposite ear yield a coherent percept supporting their phonetic categorization. Critically, such dichotic sounds are only perceived categorically upon binaural integration. Here, we used frequency-following responses (FFRs), scalp-recorded potentials reflecting phase-locked subcortical activity, to investigate brainstem responses to fused speech percepts and to determine whether FFRs reflect binaurally integrated category-level representations. We recorded FFRs to diotic and dichotic stop-consonants (/da/, /ga/) that either did or did not require binaural fusion to properly label along with perceptually ambiguous sounds without clear phonetic identity. Behaviorally, listeners showed clear categorization of dichotic speech tokens confirming they were heard with a fused, phonetic percept. Neurally, we found FFRs were stronger for categorically perceived speech relative to category-ambiguous tokens but also differentiated phonetic categories for both diotically and dichotically presented speech sounds. Correlations between neural and behavioral data further showed FFR latency predicted the degree to which listeners labeled tokens as "da" versus "ga."
The presence of binaurally integrated, category-level information in FFRs suggests human brainstem processing reflects a surprisingly abstract level of the speech code typically circumscribed to much later cortical processing.}, } @article {pmid37506120, year = {2023}, author = {Kim, KS and Gaines, JL and Parrell, B and Ramanarayanan, V and Nagarajan, SS and Houde, JF}, title = {Mechanisms of sensorimotor adaptation in a hierarchical state feedback control model of speech.}, journal = {PLoS computational biology}, volume = {19}, number = {7}, pages = {e1011244}, pmid = {37506120}, issn = {1553-7358}, support = {R01 DC017696/DC/NIDCD NIH HHS/United States ; F32 DC019538/DC/NIDCD NIH HHS/United States ; R01 DC013979/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; P50 DC019900/DC/NIDCD NIH HHS/United States ; R01 NS100440/NS/NINDS NIH HHS/United States ; }, mesh = {Humans ; Feedback ; *Speech ; *Adaptation, Physiological ; Feedback, Sensory ; Movement ; }, abstract = {Upon perceiving sensory errors during movements, the human sensorimotor system updates future movements to compensate for the errors, a phenomenon called sensorimotor adaptation. One component of this adaptation is thought to be driven by sensory prediction errors-discrepancies between predicted and actual sensory feedback. However, the mechanisms by which prediction errors drive adaptation remain unclear. Here, auditory prediction error-based mechanisms involved in speech auditory-motor adaptation were examined via the feedback aware control of tasks in speech (FACTS) model. Consistent with theoretical perspectives in both non-speech and speech motor control, the hierarchical architecture of FACTS relies on both the higher-level task (vocal tract constrictions) as well as lower-level articulatory state representations. Importantly, FACTS also computes sensory prediction errors as a part of its state feedback control mechanism, a well-established framework in the field of motor control. We explored potential adaptation mechanisms and found that adaptive behavior was present only when prediction errors updated the articulatory-to-task state transformation. In contrast, designs in which prediction errors updated forward sensory prediction models alone did not generate adaptation. Thus, FACTS demonstrated that 1) prediction errors can drive adaptation through task-level updates, and 2) adaptation is likely driven by updates to task-level control rather than (only) to forward predictive models. Additionally, simulating adaptation with FACTS generated a number of important hypotheses regarding previously reported phenomena such as identifying the source(s) of incomplete adaptation and driving factor(s) for changes in the second formant frequency during adaptation to the first formant perturbation. 
The proposed model design paves the way for a hierarchical state feedback control framework to be examined in the context of sensorimotor adaptation in both speech and non-speech effector systems.}, } @article {pmid37499137, year = {2023}, author = {Illner, V and Tykalova, T and Skrabal, D and Klempir, J and Rusz, J}, title = {Automated Vowel Articulation Analysis in Connected Speech Among Progressive Neurological Diseases, Dysarthria Types, and Dysarthria Severities.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {8}, pages = {2600-2621}, doi = {10.1044/2023_JSLHR-22-00526}, pmid = {37499137}, issn = {1558-9102}, mesh = {Humans ; Dysarthria/etiology ; Speech/physiology ; *Cerebellar Ataxia ; *Parkinson Disease/complications ; Articulation Disorders ; Atrophy ; Speech Acoustics ; Speech Intelligibility ; }, abstract = {PURPOSE: Although articulatory impairment represents distinct speech characteristics in most neurological diseases affecting movement, methods allowing automated assessments of articulation deficits from the connected speech are scarce. This study aimed to design a fully automated method for analyzing dysarthria-related vowel articulation impairment and estimate its sensitivity in a broad range of neurological diseases and various types and severities of dysarthria.

METHOD: Unconstrained monologue and reading passages were acquired from 459 speakers, including 306 healthy controls and 153 neurological patients. The algorithm utilized a formant tracker in combination with a phoneme recognizer and subsequent signal processing analysis.

RESULTS: Articulatory undershoot of vowels was present in a broad spectrum of progressive neurodegenerative diseases, including Parkinson's disease, progressive supranuclear palsy, multiple-system atrophy, Huntington's disease, essential tremor, cerebellar ataxia, multiple sclerosis, and amyotrophic lateral sclerosis, as well as in related dysarthria subtypes including hypokinetic, hyperkinetic, ataxic, spastic, flaccid, and their mixed variants. Formant ratios showed a higher sensitivity to vowel deficits than vowel space area. First formants of corner vowels were significantly lower for multiple-system atrophy than for cerebellar ataxia. Second formants of vowels /a/ and /i/ were lower in ataxic compared to spastic dysarthria. Discriminant analysis showed a classification score of up to 41.0% for disease type, 39.3% for dysarthria type, and 49.2% for dysarthria severity. Algorithm accuracy reached an F-score of 0.77.

CONCLUSIONS: Distinctive vowel articulation alterations reflect the underlying pathophysiology in neurological diseases. Objective acoustic analysis of vowel articulation has the potential to provide a universal method to screen for motor speech disorders.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.23681529.}, } @article {pmid37496795, year = {2023}, author = {Mailhos, A and Egea-Caparrós, DA and Cabana, Á and Martínez-Sánchez, F}, title = {Voice pitch is negatively associated with sociosexual behavior in males but not in females.}, journal = {Frontiers in psychology}, volume = {14}, number = {}, pages = {1200065}, pmid = {37496795}, issn = {1664-1078}, abstract = {Acoustic cues play a major role in social interactions in many animal species. In addition to the semantic contents of human speech, voice attributes - e.g., voice pitch, formant position, formant dispersion, etc. - have been proposed to provide critical information for the assessment of potential rivals and mates. However, prior studies exploring the association of acoustic attributes with reproductive success, or some of its proxies, have produced mixed results. Here, we investigate whether the mean fundamental frequency (F0), formant position (Pf), and formant dispersion (Df) - dimorphic attributes of the human voice - are related to sociosexuality, as measured by the Revised Sociosexual Orientation Inventory (SOI-R) - a trait also known to exhibit sex differences - in a sample of native Spanish-speaking students (101 males, 147 females). Analyses showed a significant negative correlation between F0 and sociosexual behavior, and between Pf and sociosexual desire in males but not in females. These correlations remained significant after correcting for false discovery rate (FDR) and controlling for age, a potential confounding variable. Our results are consistent with a role of F0 and Pf serving as cues in the mating domain in males but not in females. Alternatively, the association of voice attributes and sociosexual orientation might stem from the parallel effect of male sex hormones both on the male brain and the anatomical structures involved in voice production.}, } @article {pmid37477268, year = {2023}, author = {González-Alvarez, J and Sos-Peña, R}, title = {Body Perception From Connected Speech: Speaker Height Discrimination from Natural Sentences and Sine-Wave Replicas with and without Pitch.}, journal = {Perceptual and motor skills}, volume = {130}, number = {4}, pages = {1353-1365}, doi = {10.1177/00315125231173581}, pmid = {37477268}, issn = {1558-688X}, mesh = {Humans ; Male ; Female ; Speech ; *Speech Perception ; *Voice ; Body Height ; Language ; Pitch Perception ; }, abstract = {In addition to language, the human voice carries information about the physical characteristics of speakers, including their body size (height and weight). The fundamental speaking frequency, perceived as voice pitch, and the formant frequencies, or resonators of the vocal tract, are the acoustic speech parameters that have been most intensely studied for perceiving a speaker's body size. In this study, we created sine-wave (SW) replicas of connected speech (sentences) uttered by 20 male and 20 female speakers, consisting of three time-varying sinusoidal waves matching the frequency pattern of the first three formants of each sentence. These stimuli only provide information about the formant frequencies of a speech signal. We also created a new experimental condition by adding a sinusoidal replica of the voice pitch of each sentence. 
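As context for these sine-wave stimuli, the following is a minimal synthesis sketch: each sinusoid follows a time-varying frequency track by integrating instantaneous frequency, with an optional fourth sinusoid for the F0 track. The sampling rate and track values are illustrative, not the study's materials:

```python
import numpy as np

def sine_wave_replica(formant_tracks, fs=16000, pitch_track=None):
    """Sum of sinusoids following per-sample frequency tracks (Hz).
    formant_tracks: array (n_samples, 3) for F1-F3; pitch_track: optional
    (n_samples,) F0 track added as a fourth sinusoid."""
    tracks = [formant_tracks[:, k] for k in range(formant_tracks.shape[1])]
    if pitch_track is not None:
        tracks.append(pitch_track)
    out = np.zeros(formant_tracks.shape[0])
    for f in tracks:
        phase = 2 * np.pi * np.cumsum(f) / fs  # integrate frequency to phase
        out += np.sin(phase)
    return out / len(tracks)

# Toy example: 0.5 s with static /a/-like formants and a flat 120 Hz pitch
n = 8000
tracks = np.tile([700.0, 1200.0, 2600.0], (n, 1))
replica = sine_wave_replica(tracks, fs=16000, pitch_track=np.full(n, 120.0))
```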
Results obtained from a binary discrimination task revealed that (a) our SW replicas provided sufficient information to judge the speakers' body height above chance level; (b) adding the sinusoidal replica of the voice pitch did not significantly increase accuracy; and (c) stimuli from female speakers were more informative for body height detection and allowed higher perceptual accuracy, owing to a stronger correlation between formant frequencies and actual body height than for stimuli from male speakers.}, } @article {pmid37467104, year = {2023}, author = {Vilanova, ID and Almeida, SB and de Araújo, VS and Santos, RS and Schroder, AGD and Zeigelboim, BS and Corrêa, CC and Taveira, KVM and de Araujo, CM}, title = {Impact of orthognathic surgery on voice and speech: a systematic review and meta-analysis.}, journal = {European journal of orthodontics}, volume = {45}, number = {6}, pages = {747-763}, doi = {10.1093/ejo/cjad025}, pmid = {37467104}, issn = {1460-2210}, mesh = {Adult ; Humans ; *Orthognathic Surgery ; Speech ; *Orthognathic Surgical Procedures ; }, abstract = {BACKGROUND: Orthognathic surgical procedures, whether in one or both jaws, can affect structures involved in the articulation and resonance of voice and speech.

OBJECTIVE: To evaluate the impact of orthognathic surgery on voice and speech performance in individuals with skeletal dentofacial disharmony.

SEARCH METHODS: Word combinations and truncations were adapted for the following electronic databases: EMBASE, PubMed/Medline, Scopus, Web of Science, Cochrane Library, and Latin American and Caribbean Literature in Health Sciences (LILACS), as well as grey literature.

SELECTION CRITERIA: The research included studies on nonsyndromic adults with skeletal dentofacial disharmony undergoing orthognathic surgery. These studies assessed patients before and after surgery, or compared them with individuals with good facial harmony, using voice and speech parameters measured through validated protocols.

DATA COLLECTION AND ANALYSIS: Two independent reviewers performed all stages of the review. The Joanna Briggs Institute tool was used to assess risk of bias in the cohort studies, and ROBINS-I was used for nonrandomized clinical trials. The authors also performed a random-effects meta-analysis.
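For readers unfamiliar with the random-effects approach, a compact DerSimonian-Laird estimator is sketched below; it is a standard method for this kind of pooling, though not necessarily the exact estimator used in this review, and the effect sizes are toy values:

```python
import numpy as np

def dersimonian_laird(effects, variances):
    """Random-effects pooled estimate via the DerSimonian-Laird method."""
    y = np.asarray(effects, float)
    v = np.asarray(variances, float)
    w = 1.0 / v                              # fixed-effect weights
    y_fixed = np.sum(w * y) / np.sum(w)
    q = np.sum(w * (y - y_fixed) ** 2)       # Cochran's Q
    df = len(y) - 1
    c = np.sum(w) - np.sum(w ** 2) / np.sum(w)
    tau2 = max(0.0, (q - df) / c)            # between-study variance
    w_star = 1.0 / (v + tau2)                # random-effects weights
    pooled = np.sum(w_star * y) / np.sum(w_star)
    se = np.sqrt(1.0 / np.sum(w_star))
    return pooled, se, tau2

# Toy example: standardized mean differences and their variances
print(dersimonian_laird([0.10, -0.05, 0.20], [0.02, 0.03, 0.025]))
```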

RESULTS: A total of 1163 articles were retrieved after the last search, of which 23 were read in full. Of these, four were excluded, leaving 19 articles for quantitative synthesis. When comparing the pre- and postoperative periods, orthognathic surgery did not affect vowel production in terms of fundamental frequency, formants, or jitter and shimmer perturbation measures. According to the articles, the main articulatory errors associated with skeletal dentofacial disharmonies prior to surgery were distortions of fricative sounds, mainly /s/ and /z/.

CONCLUSIONS: Orthognathic surgery may have little or no impact on vocal characteristics during vowel production. However, due to the confounding factors involved, estimates are inconclusive. The most prevalent articulatory disorders in the preoperative period were distortions of the fricative phonemes /s/ and /z/. Further studies must therefore be carried out to ensure greater robustness of these findings.

REGISTRATION: PROSPERO (CRD42022291113).}, } @article {pmid37436271, year = {2023}, author = {Stoehr, A and Souganidis, C and Thomas, TB and Jacobsen, J and Martin, CD}, title = {Voice onset time and vowel formant measures in online testing and laboratory-based testing with(out) surgical face masks.}, journal = {The Journal of the Acoustical Society of America}, volume = {154}, number = {1}, pages = {152-166}, doi = {10.1121/10.0020064}, pmid = {37436271}, issn = {1520-8524}, mesh = {Humans ; Masks ; Pandemics ; Speech Acoustics ; *COVID-19 ; *Voice ; Phonetics ; }, abstract = {Since the COVID-19 pandemic started, conducting experiments online is increasingly common, and face masks are often used in everyday life. It remains unclear whether phonetic detail in speech production is captured adequately when speech is recorded in internet-based experiments or in experiments conducted with face masks. We tested 55 Spanish-Basque-English trilinguals in picture naming tasks in three conditions: online, laboratory-based with surgical face masks, and laboratory-based without face masks (control). We measured plosive voice onset time (VOT) in each language, the formants and duration of English vowels /iː/ and /ɪ/, and the Spanish/Basque vowel space. Across conditions, there were differences between English and Spanish/Basque VOT, and in the formants and duration of English /iː/ versus /ɪ/; between conditions, only small differences emerged. Relative to the control condition, the Spanish/Basque vowel space was larger in online testing and smaller in the face mask condition. We conclude that testing online or with face masks is suitable for investigating phonetic detail in within-participant designs, although the precise measurements may differ from those in traditional laboratory-based research.}, } @article {pmid37433805, year = {2023}, author = {Kries, J and De Clercq, P and Lemmens, R and Francart, T and Vandermosten, M}, title = {Acoustic and phonemic processing are impaired in individuals with aphasia.}, journal = {Scientific reports}, volume = {13}, number = {1}, pages = {11208}, pmid = {37433805}, issn = {2045-2322}, mesh = {Humans ; *Aphasia/etiology ; *Language Disorders ; Acoustics ; Cognition ; Individuality ; }, abstract = {Acoustic and phonemic processing are understudied in aphasia, a language disorder that can affect different levels and modalities of language processing. For successful speech comprehension, processing of the speech envelope, which reflects amplitude changes over time (e.g., rise times), is necessary. Moreover, to identify speech sounds (i.e., phonemes), efficient processing of spectro-temporal changes as reflected in formant transitions is essential. Given the underrepresentation of aphasia studies on these aspects, we tested rise time processing and phoneme identification in 29 individuals with post-stroke aphasia and 23 healthy age-matched controls. We found significantly lower performance in the aphasia group than in the control group on both tasks, even when controlling for individual differences in hearing levels and cognitive functioning. Further, by conducting an individual deviance analysis, we found a low-level acoustic or phonemic processing impairment in 76% of individuals with aphasia. Additionally, we investigated whether this impairment would propagate to higher-level language processing and found that rise time processing predicts phonological processing performance in individuals with aphasia.
These findings show that it is important to develop diagnostic and treatment tools that target low-level language processing mechanisms.}, } @article {pmid37424066, year = {2024}, author = {Maes, P and Weyland, M and Kissine, M}, title = {Structure and acoustics of the speech of verbal autistic preschoolers.}, journal = {Journal of child language}, volume = {51}, number = {3}, pages = {509-525}, doi = {10.1017/S0305000923000417}, pmid = {37424066}, issn = {1469-7602}, support = {//Fondation Roger de Spoelberch/ ; //Fondation Francqui - Stichting/ ; //Marguerite-Marie Delacroix foundation/ ; }, mesh = {Humans ; Child, Preschool ; Male ; Female ; *Speech Acoustics ; Phonetics ; Child Language ; Autistic Disorder/psychology ; Speech ; Speech Production Measurement ; }, abstract = {In this study, we report an extensive investigation of the structural language and acoustical specificities of the spontaneous speech of ten three- to five-year-old verbal autistic children. The autistic children were compared to a group of ten typically developing children matched pairwise on chronological age, nonverbal IQ and socioeconomic status, and groupwise on verbal IQ and gender on various measures of structural language (phonetic inventory, lexical diversity and morpho-syntactic complexity) and a series of acoustical measures of speech (mean and range fundamental frequency, a formant dispersion index, syllable duration, jitter and shimmer). Results showed that, overall, the structure and acoustics of the verbal autistic children's speech were highly similar to those of the TD children. Few remaining atypicalities in the speech of autistic children lay in a restricted use of different vocabulary items, a somewhat diminished morpho-syntactic complexity, and a slightly exaggerated syllable duration.}, } @article {pmid37417627, year = {2023}, author = {Park, EJ and Yoo, SD}, title = {Correlation between the parameters of quadrilateral vowel and dysphonia severity in patients with traumatic brain injury.}, journal = {Medicine}, volume = {102}, number = {27}, pages = {e33030}, pmid = {37417627}, issn = {1536-5964}, mesh = {Humans ; *Dysphonia/diagnosis/etiology ; Retrospective Studies ; Dysarthria ; Quality of Life ; Acoustics ; }, abstract = {Dysarthria and dysphonia are common in patients with traumatic brain injury (TBI). Multiple factors may contribute to TBI-induced dysarthria, including poor vocalization, articulation, respiration, and/or resonance. Many patients suffer from dysarthria that persists after the onset of TBI, with negative effects on their quality of life. This study aimed to investigate the relationship between vowel quadrilateral parameters and the Dysphonia Severity Index (DSI), which objectively reflects vocal function. We retrospectively enrolled TBI patients diagnosed using computed tomography. Participants had dysarthria and dysphonia and underwent acoustic analysis. Praat software was used to measure vowel space area (VSA), formant centralization ratio (FCR), and the second formant (F2) ratio. For the 4 corner vowels (/a/, /u/, /i/, and /ae/), the resonance frequencies of the vocal tract were measured and are shown as 2-dimensional coordinates for the formant parameters. Pearson correlation and multiple linear regression analyses were performed between the variables. VSA showed a significant positive correlation with DSI/a/ (R = 0.221) and DSI/i/ (R = 0.026). FCR showed a significant negative correlation with DSI/u/ and DSI/i/.
The F2 ratio showed a significant positive correlation with DSI/u/ and DSI/ae/. In the multiple linear regression analysis, VSA was found to be a significant predictor of DSI/a/ (β = 0.221, P = .030, R² = 0.139). F2 ratio (β = 0.275, P = .015) and FCR (β = -0.218, P = .029) were significant predictors of DSI/u/ (R² = 0.203). FCR was a significant predictor of DSI/i/ (β = -0.260, P = .010, R² = 0.158). F2 ratio was a significant predictor of DSI/ae/ (β = 0.254, P = .013, R² = 0.154). Vowel quadrilateral parameters, such as VSA, FCR, and F2 ratio, may be associated with dysphonia severity in TBI patients.}, } @article {pmid37416548, year = {2023}, author = {Persson, A and Jaeger, TF}, title = {Evaluating normalization accounts against the dense vowel space of Central Swedish.}, journal = {Frontiers in psychology}, volume = {14}, number = {}, pages = {1165742}, pmid = {37416548}, issn = {1664-1078}, abstract = {Talkers vary in the phonetic realization of their vowels. One influential hypothesis holds that listeners overcome this inter-talker variability through pre-linguistic auditory mechanisms that normalize the acoustic or phonetic cues that form the input to speech recognition. Dozens of competing normalization accounts exist-including both accounts specific to vowel perception and general purpose accounts that can be applied to any type of cue. We add to the cross-linguistic literature on this matter by comparing normalization accounts against a new phonetically annotated vowel database of Swedish, a language with a particularly dense vowel inventory of 21 vowels differing in quality and quantity. We evaluate normalization accounts on how they differ in predicted consequences for perception. The results indicate that the best performing accounts either center or standardize formants by talker. The study also suggests that general purpose accounts perform as well as vowel-specific accounts, and that vowel normalization operates in both temporal and spectral domains.}, } @article {pmid37413966, year = {2023}, author = {Steinschneider, M}, title = {Toward an understanding of vowel encoding in the human auditory cortex.}, journal = {Neuron}, volume = {111}, number = {13}, pages = {1995-1997}, doi = {10.1016/j.neuron.2023.06.004}, pmid = {37413966}, issn = {1097-4199}, mesh = {Humans ; *Auditory Cortex/physiology ; Phonetics ; *Speech Perception/physiology ; }, abstract = {In this issue of Neuron, Oganian et al.[1] performed intracranial recordings in the auditory cortex of human subjects to clarify how vowels are encoded by the brain. Formant-based tuning curves demonstrated the organization of vowel encoding. The need for population codes and demonstration of speaker normalization were emphasized.}, } @article {pmid37404579, year = {2023}, author = {Hong, Y and Chen, S and Zhou, F and Chan, A and Tang, T}, title = {Phonetic entrainment in L2 human-robot interaction: an investigation of children with and without autism spectrum disorder.}, journal = {Frontiers in psychology}, volume = {14}, number = {}, pages = {1128976}, pmid = {37404579}, issn = {1664-1078}, abstract = {Phonetic entrainment is a phenomenon in which people adjust their phonetic features to approach those of their conversation partner. Individuals with Autism Spectrum Disorder (ASD) have been reported to show some deficits in entrainment during their interactions with human interlocutors, though deficits in terms of significant differences from typically developing (TD) controls were not always registered.
One reason for these inconsistencies in detecting deficits in autistic individuals is that the conversation partner's speech could hardly be controlled, and both the participants and the partners might have been adjusting their phonetic features. The variability in the conversation partners' speech and the various social traits they exhibited might make the phonetic entrainment (if any) of the participants less detectable. In this study, we attempted to reduce the variability of the interlocutors by employing a social robot and having it perform a goal-directed conversation task with children with and without ASD. Fourteen autistic children and 12 TD children participated in the current study in their second language, English. Results showed that autistic children exhibited vowel formant and mean fundamental frequency (f0) entrainment comparable to that of their TD peers, but they did not entrain their f0 range as the TD group did. These findings suggest that autistic children were capable of exhibiting phonetic entrainment behaviors similar to TD children in vowel formants and f0, particularly in a less complex situation where the speech features and social traits of the interlocutor were controlled. Furthermore, the utilization of a social robot may have increased the interest of these children in phonetic entrainment. On the other hand, entrainment of f0 range was more challenging for these autistic children even in a more controlled situation. This study demonstrates the viability and potential of using human-robot interactions as a novel method to evaluate abilities and deficits in phonetic entrainment in autistic children.}, } @article {pmid37401990, year = {2023}, author = {Terranova, F and Baciadonna, L and Maccarone, C and Isaja, V and Gamba, M and Favaro, L}, title = {Penguins perceive variations of source- and filter-related vocal parameters of species-specific vocalisations.}, journal = {Animal cognition}, volume = {26}, number = {5}, pages = {1613-1622}, pmid = {37401990}, issn = {1435-9456}, mesh = {Animals ; *Spheniscidae ; Vocalization, Animal ; Species Specificity ; Acoustics ; Sound ; }, abstract = {Animal vocalisations encode a wide range of biological information about the age, sex, body size, and social status of the emitter. Moreover, vocalisations play a significant role in signalling the identity of the emitter to conspecifics. Recent studies have shown that, in the African penguin (Spheniscus demersus), acoustic cues to individual identity are encoded in the fundamental frequency (F0) and resonance frequencies (formants) of the vocal tract. However, although penguins are known to produce vocalisations where F0 and formants vary among individuals, it remains to be tested whether the receivers can perceive and use such information in the individual recognition process. In this study, using the Habituation-Dishabituation (HD) paradigm, we tested the hypothesis that penguins perceive and respond to a shift of ± 20% (corresponding to the natural inter-individual variation observed in ex-situ colonies) of F0 and formant dispersion (ΔF) of species-specific calls. We found that penguins were more likely to look rapidly and for longer at the source of the sound when F0 and formants of the calls were manipulated, indicating that they could perceive variations of these parameters in the vocal signals.
Our findings provide the first experimental evidence that, in the African penguin, listeners can perceive changes in F0 and formants, which can be used by the receiver as potential cues for the individual discrimination of the emitter.}, } @article {pmid37391267, year = {2024}, author = {Panneton, R and Cristia, A and Taylor, C and Moon, C}, title = {Positive Valence Contributes to Hyperarticulation in Maternal Speech to Infants and Puppies.}, journal = {Journal of child language}, volume = {51}, number = {5}, pages = {1230-1240}, doi = {10.1017/S0305000923000296}, pmid = {37391267}, issn = {1469-7602}, mesh = {Humans ; Female ; Infant ; *Mother-Child Relations ; *Emotions ; Animals ; Dogs ; Speech ; Maternal Behavior/psychology ; Male ; Mothers/psychology ; Adult ; Phonetics ; }, abstract = {Infant-directed speech often has hyperarticulated features, such as point vowels whose formants are further apart than in adult-directed speech. This increased "vowel space" may reflect the caretaker's effort to speak more clearly to infants, thus benefiting language processing. However, hyperarticulation may also result from more positive valence (e.g., speaking with positive vocal emotion) often found in mothers' speech to infants. This study was designed to replicate others who have found hyperarticulation in maternal speech to their 6-month-olds, but also to examine their speech to a non-human infant (i.e., a puppy). We rated both kinds of maternal speech for their emotional valence and recorded mothers' speech to a human adult. We found that mothers produced more positively valenced utterances and some hyperarticulation in both their infant- and puppy-directed speech, compared to their adult-directed speech. This finding promotes looking at maternal speech from a multi-faceted perspective that includes emotional state.}, } @article {pmid37384576, year = {2023}, author = {Vogt, C and Floegel, M and Kasper, J and Gispert-Sánchez, S and Kell, CA}, title = {Oxytocinergic modulation of speech production-a double-blind placebo-controlled fMRI study.}, journal = {Social cognitive and affective neuroscience}, volume = {18}, number = {1}, pages = {}, pmid = {37384576}, issn = {1749-5024}, mesh = {Humans ; Male ; *Speech ; *Oxytocin/pharmacology ; Magnetic Resonance Imaging ; Receptors, Oxytocin/genetics ; Language ; Double-Blind Method ; Administration, Intranasal ; Brain/physiology ; }, abstract = {Many socio-affective behaviors, such as speech, are modulated by oxytocin. While oxytocin modulates speech perception, it is not known whether it also affects speech production. Here, we investigated effects of oxytocin administration and interactions with the functional rs53576 oxytocin receptor (OXTR) polymorphism on produced speech and its underlying brain activity. During functional magnetic resonance imaging, 52 healthy male participants read sentences out loud with either neutral or happy intonation, a covert reading condition served as a common baseline. Participants were studied once under the influence of intranasal oxytocin and in another session under placebo. Oxytocin administration increased the second formant of produced vowels. This acoustic feature has previously been associated with speech valence; however, the acoustic differences were not perceptually distinguishable in our experimental setting. 
When preparing to speak, oxytocin enhanced brain activity in sensorimotor cortices and regions of both dorsal and right ventral speech processing streams, as well as subcortical and cortical limbic and executive control regions. In some of these regions, the rs53576 OXTR polymorphism modulated oxytocin administration-related brain activity. Oxytocin also gated cortical-basal ganglia circuits involved in the generation of happy prosody. Our findings suggest that several neural processes underlying speech production are modulated by oxytocin, including control of not only affective intonation but also sensorimotor aspects during emotionally neutral speech.}, } @article {pmid37344246, year = {2023}, author = {Vasquez-Serrano, P and Reyes-Moreno, J and Guido, RC and Sepúlveda-Sepúlveda, A}, title = {MFCC Parameters of the Speech Signal: An Alternative to Formant-Based Instantaneous Vocal Tract Length Estimation.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.05.012}, pmid = {37344246}, issn = {1873-4588}, abstract = {On the one hand, the relationship between formant frequencies and vocal tract length (VTL) has been intensively studied over the years. On the other hand, the connection involving mel-frequency cepstral coefficients (MFCCs), which concisely codify the overall shape of a speaker's spectral envelope with just a few cepstral coefficients, and VTL has only been modestly analyzed, being worth of further investigation. Thus, based on different statistical models, this article explores the advantages and disadvantages of the latter approach, which is relatively novel, in contrast to the former which arises from more traditional studies. Additionally, VTL is assumed to be a static and inherent characteristic of speakers, that is, a single length parameter is frequently estimated per speaker. By contrast, in this paper we consider VTL estimation from a dynamic perspective using modern real-time Magnetic Resonance Imaging (rtMRI) to measure VTL in parallel with audio signals. To support the experiments, data obtained from USC-TIMIT magnetic resonance videos were used, allowing for the 2D real-time analysis of articulators in motion. As a result, we observed that the performance of MFCCs is higher in the case of speaker-dependent modeling; however, in the case of cross-speaker modeling, which uses different speakers' data for training and evaluation, performance is not significantly different from that obtained with formants.
In addition, the MFCC-based estimation is robust, with acceptable computational time complexity, consistent with the traditional approach.}, } @article {pmid37307398, year = {2023}, author = {Cox, C and Dideriksen, C and Keren-Portnoy, T and Roepstorff, A and Christiansen, MH and Fusaroli, R}, title = {Infant-directed speech does not always involve exaggerated vowel distinctions: Evidence from Danish.}, journal = {Child development}, volume = {94}, number = {6}, pages = {1672-1696}, doi = {10.1111/cdev.13950}, pmid = {37307398}, issn = {1467-8624}, support = {DFF-7013-00074//Danmarks Frie Forskningsfond/ ; //Interacting Minds Centre/ ; }, mesh = {Adult ; Infant ; Humans ; Female ; Child, Preschool ; Male ; Child ; *Speech ; Language ; Language Development ; Child Language ; *Speech Perception ; Denmark ; Phonetics ; Speech Acoustics ; }, abstract = {This study compared the acoustic properties of 26 (100% female, 100% monolingual) Danish caregivers' spontaneous speech addressed to their 11- to 24-month-old infants (infant-directed speech, IDS) and an adult experimenter (adult-directed speech, ADS). The data were collected between 2016 and 2018 in Aarhus, Denmark. Prosodic properties of Danish IDS conformed to cross-linguistic patterns, with a higher pitch, greater pitch variability, and slower articulation rate than ADS. However, an acoustic analysis of vocalic properties revealed that Danish IDS had a reduced or similar vowel space, higher within-vowel variability, raised formants, and lower degree of vowel discriminability compared to ADS. None of the measures, except articulation rate, showed age-related differences. These results push for future research to conduct theory-driven comparisons across languages with distinct phonological systems.}, } @article {pmid37305920, year = {2023}, author = {, }, title = {Editor's note: Chimpanzee vowel-like sounds and voice quality suggest formant space expansion through the hominoid lineage.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {378}, number = {1882}, pages = {20230201}, pmid = {37305920}, issn = {1471-2970}, } @article {pmid37303890, year = {2023}, author = {Baron, A and Harwood, V and Kleinman, D and Campanelli, L and Molski, J and Landi, N and Irwin, J}, title = {Where on the face do we look during phonemic restoration: An eye-tracking study.}, journal = {Frontiers in psychology}, volume = {14}, number = {}, pages = {1005186}, pmid = {37303890}, issn = {1664-1078}, abstract = {Face to face communication typically involves audio and visual components to the speech signal. To examine the effect of task demands on gaze patterns in response to a speaking face, adults participated in two eye-tracking experiments with an audiovisual (articulatory information from the mouth was visible) and a pixelated condition (articulatory information was not visible). Further, task demands were manipulated by having listeners respond in a passive (no response) or an active (button press response) context. The active experiment required participants to discriminate between speech stimuli and was designed to mimic environmental situations which require one to use visual information to disambiguate the speaker's message, simulating different listening conditions in real-world settings. Stimuli included a clear exemplar of the syllable /ba/ and a second exemplar in which the formant transitions of the initial consonant were reduced, creating an /a/-like consonant.
Consistent with our hypothesis, results revealed that fixations to the mouth were greatest in the audiovisual active experiment and that visual articulatory information led to a phonemic restoration effect for the /a/ speech token. In the pixelated condition, participants fixated on the eyes, and discrimination of the deviant token within the active experiment was significantly greater than in the audiovisual condition. These results suggest that when required to disambiguate changes in speech, adults may look to the mouth for additional cues to support processing when such information is available.}, } @article {pmid37302909, year = {2023}, author = {Ikuma, T and McWhorter, AJ and Oral, E and Kunduk, M}, title = {Formant-Aware Spectral Analysis of Sustained Vowels of Pathological Breathy Voice.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.05.002}, pmid = {37302909}, issn = {1873-4588}, abstract = {OBJECTIVES: This paper reports the effectiveness of formant-aware spectral parameters in predicting the perceptual breathiness rating. A breathy voice has a steeper spectral slope and higher turbulent noise than a normal voice. Measuring spectral parameters of acoustic signals over lower formant regions is a known approach to capture the properties related to breathiness. This study examines this approach by testing the contemporary spectral parameters and algorithms within the framework, alternate frequency band designs, and vowel effects.

METHODS: Sustained vowel recordings (/a/, /i/, and /u/) of speakers with voice disorders in the German Saarbruecken Voice Database were considered (n: 367). Recordings with signal irregularities (such as subharmonics) or with roughness perception were excluded from the study. Four speech-language pathologists perceptually rated the recordings for breathiness on a 100-point scale, and their averages were used in the analysis. The acoustic spectra were segmented into four frequency bands according to the vowel formant structures. Four spectral parameters (intraband harmonics-to-noise ratio, HNR; interband harmonics ratio, HHR; interband noise ratio, NNR; and interband glottal-to-noise energy ratio, GNE) were evaluated in each band to predict the perceptual breathiness rating. Four HNR algorithms were tested.
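A simplified illustration of an intraband harmonics-to-noise measure follows, assuming a known F0 and treating spectral energy near harmonic frequencies as signal and the rest of the band as noise; the paper's actual HNR algorithms are more elaborate, and the tolerance and band edges here are arbitrary choices:

```python
import numpy as np

def band_hnr(x, fs, f0, band, tol_hz=15.0):
    """Crude intraband harmonics-to-noise ratio (dB): energy in bins near
    multiples of f0 vs. energy between harmonics, within band (lo, hi) Hz."""
    spec = np.abs(np.fft.rfft(x * np.hanning(len(x)))) ** 2
    freqs = np.fft.rfftfreq(len(x), 1.0 / fs)
    in_band = (freqs >= band[0]) & (freqs <= band[1])
    dist = np.abs(freqs - f0 * np.round(freqs / f0))  # distance to harmonic
    harm = in_band & (dist <= tol_hz)
    noise = in_band & (dist > tol_hz)
    return 10 * np.log10(spec[harm].sum() / spec[noise].sum())

# Toy test: 1 s harmonic source (F0 = 100 Hz) plus weak noise, F1-F2 band
fs = 16000
t = np.arange(fs) / fs
x = sum(np.sin(2 * np.pi * 100 * k * t) / k for k in range(1, 30))
x += 0.05 * np.random.randn(len(t))
print(band_hnr(x, fs, f0=100.0, band=(50.0, 2500.0)))
```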

RESULTS: Multiple linear regression models of spectral parameters, led by the HNRs, were shown to explain up to 85% of the variance in perceptual breathiness ratings. This performance exceeded that of the acoustic breathiness index (82%). Individually, the HNR over the first two formants best explained the variance in breathiness (78%), exceeding the smoothed cepstral peak prominence (74%). The performance of HNR was highly algorithm-dependent (10% spread). Some vowel effects were observed in the perceptual rating (higher for /u/), predictability (5% lower for /u/), and model parameter selections.
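The regression step can be sketched as follows, with synthetic stand-ins for the per-recording spectral parameters and perceptual ratings (the real study used the band-wise measures described above):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Synthetic placeholders: 367 recordings, four band-wise spectral parameters
# (e.g., HNR, HHR, NNR, GNE) and mean breathiness ratings on a 100-point scale.
rng = np.random.default_rng(0)
X = rng.normal(size=(367, 4))
ratings = 50 - 8 * X[:, 0] + rng.normal(scale=5, size=367)

model = LinearRegression().fit(X, ratings)
r2 = model.score(X, ratings)  # variance explained, cf. up to ~0.85 in the paper
print(f"R^2 = {r2:.2f}, coefficients = {model.coef_.round(2)}")
```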

CONCLUSIONS: Strong per-vowel breathiness acoustic models were found by segmenting the spectrum to isolate the portion most affected by breathiness.}, } @article {pmid37260602, year = {2023}, author = {Ashokumar, M and Guichet, C and Schwartz, JL and Ito, T}, title = {Correlation between the effect of orofacial somatosensory inputs in speech perception and speech production performance.}, journal = {Auditory perception & cognition}, volume = {6}, number = {1-2}, pages = {97-107}, pmid = {37260602}, issn = {2574-2450}, support = {R01 DC017439/DC/NIDCD NIH HHS/United States ; }, abstract = {INTRODUCTION: Orofacial somatosensory inputs modify the perception of speech sounds. Such auditory-somatosensory integration likely develops alongside speech production acquisition. We examined whether the somatosensory effect in speech perception varies depending on individual characteristics of speech production.

METHODS: The somatosensory effect in speech perception was assessed as the change in the category boundary between /e/ and /ø/ in a vowel identification test when somatosensory stimulation, a rearward facial skin deformation corresponding to the articulatory movement for /e/, was applied together with the auditory input. Speech production performance was quantified by the acoustic distances between the average first, second, and third formants of /e/ and /ø/ utterances recorded in a separate test.
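A minimal sketch of the production measure, assuming per-utterance formant measurements for each vowel (the formant values below are hypothetical):

```python
import numpy as np

def formant_distances(e_formants, oe_formants):
    """Per-formant acoustic distances (Hz) between average /e/ and /ø/
    productions; rows are utterances, columns are F1-F3."""
    e_mean = np.mean(np.asarray(e_formants, float), axis=0)
    oe_mean = np.mean(np.asarray(oe_formants, float), axis=0)
    return np.abs(e_mean - oe_mean)  # [|dF1|, |dF2|, |dF3|]

# Hypothetical utterance-level formants (F1, F2, F3) in Hz
e  = [(390, 2100, 2700), (405, 2150, 2760)]
oe = [(395, 1550, 2380), (410, 1600, 2450)]
print(formant_distances(e, oe))  # the F2 distance should dominate
```

Each participant's boundary shift could then be correlated against these per-formant distances across speakers, which is the analysis reported in the results.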

RESULTS: The category boundary between /e/ and /ø/ was significantly shifted towards /ø/ due to the somatosensory stimulation, which is consistent with previous research. The amplitude of the category boundary shift was significantly correlated with the acoustic distance between the mean second (and, marginally, third) formants of /e/ and /ø/ productions, with no correlation with the first formant distance.

DISCUSSION: Greater acoustic distances can be related to larger contrasts between the articulatory targets of vowels in speech production. These results suggest that the somatosensory effect in speech perception can be linked to speech production performance.}, } @article {pmid37227411, year = {2023}, author = {Saba, JN and Ali, H and Hansen, JHL}, title = {The effects of estimation accuracy, estimation approach, and number of selected channels using formant-priority channel selection for an "n-of-m" sound processing strategy for cochlear implants.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {5}, pages = {3100}, pmid = {37227411}, issn = {1520-8524}, support = {R01 DC016839/DC/NIDCD NIH HHS/United States ; }, mesh = {*Cochlear Implants ; *Speech Perception ; *Cochlear Implantation ; Sound ; Noise ; }, abstract = {Previously, selection of l channels was prioritized according to formant frequency locations in an l-of-n-of-m-based signal processing strategy to provide important voicing information independent of listening environments for cochlear implant (CI) users. In this study, ideal, or ground truth, formants were incorporated into the selection stage to determine the effect of accuracy on (1) subjective speech intelligibility, (2) objective channel selection patterns, and (3) objective stimulation patterns (current). An average +11% improvement (p < 0.05) was observed across six CI users in quiet, but not for noise or reverberation conditions. Analogous increases in channel selection and current for the upper range of F1, and a decrease across mid-frequencies with higher corresponding current, were both observed at the expense of noise-dominant channels. Objective channel selection patterns were analyzed a second time to determine the effects of estimation approach and number of selected channels (n). A significant effect of estimation approach was only observed in the noise and reverberation condition with minor differences in channel selection and significantly decreased stimulated current. Results suggest that estimation method, accuracy, and number of channels in the proposed strategy using ideal formants may improve intelligibility when the corresponding stimulated current of formant channels is not masked by noise-dominant channels.}, } @article {pmid37224720, year = {2023}, author = {Carney, LH and Cameron, DA and Kinast, KB and Feld, CE and Schwarz, DM and Leong, UC and McDonough, JM}, title = {Effects of sensorineural hearing loss on formant-frequency discrimination: Measurements and models.}, journal = {Hearing research}, volume = {435}, number = {}, pages = {108788}, pmid = {37224720}, issn = {1878-5891}, support = {R01 DC001641/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Speech Perception/physiology ; *Hearing Loss, Sensorineural/diagnosis ; Mesencephalon ; *Inferior Colliculi/physiology ; Cochlear Nerve ; Phonetics ; }, abstract = {This study concerns the effect of hearing loss on discrimination of formant frequencies in vowels. In the response of the healthy ear to a harmonic sound, auditory-nerve (AN) rate functions fluctuate at the fundamental frequency, F0. Responses of inner-hair-cells (IHCs) tuned near spectral peaks are captured (or dominated) by a single harmonic, resulting in lower fluctuation depths than responses of IHCs tuned between spectral peaks. Therefore, the depth of neural fluctuations (NFs) varies along the tonotopic axis and encodes spectral peaks, including formant frequencies of vowels.
This NF code is robust across a wide range of sound levels and in background noise. The NF profile is converted into a rate-place representation in the auditory midbrain, wherein neurons are sensitive to low-frequency fluctuations. The NF code is vulnerable to sensorineural hearing loss (SNHL) because capture depends upon saturation of IHCs, and thus the interaction of cochlear gain with IHC transduction. In this study, formant-frequency discrimination limens (DLFFs) were estimated for listeners with normal hearing or mild to moderate SNHL. The F0 was fixed at 100 Hz, and formant peaks were either aligned with harmonic frequencies or placed between harmonics. Formant peak frequencies were 600 and 2000 Hz, in the range of first and second formants of several vowels. The difficulty of the task was varied by changing formant bandwidth to modulate the contrast in the NF profile. Results were compared to predictions from model auditory-nerve and inferior colliculus (IC) neurons, with listeners' audiograms used to individualize the AN model. Correlations between DLFFs, audiometric thresholds near the formant frequencies, age, and scores on the Quick speech-in-noise test are reported. SNHL had a strong effect on DLFF for the second formant frequency (F2), but relatively small effect on DLFF for the first formant (F1). The IC model appropriately predicted substantial threshold elevations for changes in F2 as a function of SNHL and little effect of SNHL on thresholds for changes in F1.}, } @article {pmid37214801, year = {2023}, author = {Rizzi, R and Bidelman, GM}, title = {Duplex perception reveals brainstem auditory representations are modulated by listeners' ongoing percept for speech.}, journal = {bioRxiv : the preprint server for biology}, volume = {}, number = {}, pages = {}, pmid = {37214801}, issn = {2692-8205}, support = {R01 DC016267/DC/NIDCD NIH HHS/United States ; }, abstract = {So-called duplex speech stimuli with perceptually ambiguous spectral cues to one ear and isolated low- vs. high-frequency third formant "chirp" to the opposite ear yield a coherent percept supporting their phonetic categorization. Critically, such dichotic sounds are only perceived categorically upon binaural integration. Here, we used frequency-following responses (FFRs), scalp-recorded potentials reflecting phase-locked subcortical activity, to investigate brainstem responses to fused speech percepts and to determine whether FFRs reflect binaurally integrated category-level representations. We recorded FFRs to diotic and dichotic stop-consonants (/da/, /ga/) that either did or did not require binaural fusion to properly label along with perceptually ambiguous sounds without clear phonetic identity. Behaviorally, listeners showed clear categorization of dichotic speech tokens confirming they were heard with a fused, phonetic percept. Neurally, we found FFRs were stronger for categorically perceived speech relative to category-ambiguous tokens but also differentiated phonetic categories for both diotically and dichotically presented speech sounds. Correlations between neural and behavioral data further showed FFR latency predicted the degree to which listeners labeled tokens as "da" vs. "ga". 
The presence of binaurally integrated, category-level information in FFRs suggests human brainstem processing reflects a surprisingly abstract level of the speech code typically circumscribed to much later cortical processing.}, } @article {pmid37212513, year = {2023}, author = {Cox, SR and Huang, T and Chen, WR and Ng, ML}, title = {An acoustic study of Cantonese alaryngeal speech in different speaking conditions.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {5}, pages = {2973}, pmid = {37212513}, issn = {1520-8524}, support = {R01 DC002717/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Speech, Alaryngeal/methods ; Speech, Esophageal ; Speech ; *Larynx, Artificial ; Acoustics ; Speech Intelligibility ; Speech Acoustics ; }, abstract = {Esophageal (ES) speech, tracheoesophageal (TE) speech, and the electrolarynx (EL) are common methods of communication following the removal of the larynx. Our recent study demonstrated that intelligibility may increase for Cantonese alaryngeal speakers using clear speech (CS) compared to their everyday "habitual speech" (HS), but the reasoning is still unclear [Hui, Cox, Huang, Chen, and Ng (2022). Folia Phoniatr. Logop. 74, 103-111]. The purpose of this study was to assess the acoustic characteristics of vowels and tones produced by Cantonese alaryngeal speakers using HS and CS. Thirty-one alaryngeal speakers (9 EL, 10 ES, and 12 TE speakers) read The North Wind and the Sun passage in HS and CS. Vowel formants, vowel space area (VSA), speaking rate, pitch, and intensity were examined, and their relationship to intelligibility were evaluated. Statistical models suggest that larger VSAs significantly improved intelligibility, but slower speaking rate did not. Vowel and tonal contrasts did not differ between HS and CS for all three groups, but the amount of information encoded in fundamental frequency and intensity differences between high and low tones positively correlated with intelligibility for TE and ES groups, respectively. Continued research is needed to understand the effects of different speaking conditions toward improving acoustic and perceptual characteristics of Cantonese alaryngeal speech.}, } @article {pmid37210244, year = {2023}, author = {Valls-Ontañón, A and Ferreiro, M and Moragues-Aguiló, B and Molins-Ballabriga, G and Julián-González, S and Sauca-Balart, A and Hernández-Alfaro, F}, title = {Impact of 3-dimensional anatomical changes secondary to orthognathic surgery on voice resonance and articulatory function: a prospective study.}, journal = {The British journal of oral & maxillofacial surgery}, volume = {61}, number = {5}, pages = {373-379}, doi = {10.1016/j.bjoms.2023.04.007}, pmid = {37210244}, issn = {1532-1940}, mesh = {Humans ; *Orthognathic Surgery ; Prospective Studies ; Facial Bones ; Speech ; Tongue ; Speech Acoustics ; }, abstract = {An evaluation was made of the impact of orthognathic surgery (OS) on speech, addressing in particular the effects of skeletal and airway changes on voice resonance characteristics and articulatory function. A prospective study was carried out involving 29 consecutive patients subjected to OS.
Preoperative and short- and long-term postoperative evaluations were made of anatomical changes (skeletal and airway measurements), speech evolution (assessed objectively by acoustic analysis: fundamental frequency, local jitter, local shimmer of each vowel, and formants F1 and F2 of vowel /a/), and articulatory function (use of compensatory musculature, point of articulation, and speech intelligibility). These were also assessed subjectively by means of a visual analogue scale. Articulatory function after OS showed immediate improvement and had further progressed at one year of follow-up. This improvement significantly correlated with the anatomical changes, and was also notably perceived by the patients. On the other hand, although a slight modification in vocal resonance was reported and seen to correlate with anatomical changes of the tongue, hyoid bone, and airway, it was not subjectively perceived by the patients. In conclusion, the results demonstrated that OS had beneficial effects on articulatory function and imperceptible subjective changes in a patient's voice. Patients subjected to OS, apart from benefitting from improved articulatory function, should not be afraid that they will not recognise their voice after treatment.}, } @article {pmid37205390, year = {2023}, author = {Shellikeri, S and Cho, S and Ash, S and Gonzalez-Recober, C and McMillan, CT and Elman, L and Quinn, C and Amado, DA and Baer, M and Irwin, DJ and Massimo, L and Olm, C and Liberman, M and Grossman, M and Nevler, N}, title = {Digital markers of motor speech impairments in natural speech of patients with ALS-FTD spectrum disorders.}, journal = {medRxiv : the preprint server for health sciences}, volume = {}, number = {}, pages = {}, doi = {10.1101/2023.04.29.23289308}, pmid = {37205390}, support = {K08 NS114106/NS/NINDS NIH HHS/United States ; }, abstract = {BACKGROUND AND OBJECTIVES: Patients with ALS-FTD spectrum disorders (ALS-FTSD) have mixed motor and cognitive impairments and require valid and quantitative assessment tools to support diagnosis and tracking of bulbar motor disease. This study aimed to validate a novel automated digital speech tool that analyzes vowel acoustics from natural, connected speech as a marker for impaired articulation due to bulbar motor disease in ALS-FTSD.

METHODS: We used an automatic algorithm called Forced Alignment Vowel Extraction (FAVE) to detect spoken vowels and extract vowel acoustics from 1-minute audio-recorded picture descriptions. Using automated acoustic analysis scripts, we derived two articulatory-acoustic measures: vowel space area (VSA, in Bark²), which represents tongue range of motion (size), and average second formant slope of vowel trajectories (F2 slope), which represents tongue movement speed. We compared vowel measures between ALS with and without clinically evident bulbar motor disease (ALS+bulbar vs. ALS-bulbar), behavioral variant frontotemporal dementia (bvFTD) without a motor syndrome, and healthy controls (HC). We correlated impaired vowel measures with bulbar disease severity, estimated by clinical bulbar scores and perceived listener effort, and with MRI cortical thickness of the orobuccal part of the primary motor cortex innervating the tongue (oralPMC). We also tested correlations with respiratory capacity and cognitive impairment.
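The two articulatory-acoustic measures can be approximated as below; the Traunmueller Hz-to-Bark formula and a least-squares slope are standard choices for this kind of analysis, though FAVE's internals and the authors' exact slope definition may differ, and the trajectory values are toy data:

```python
import numpy as np

def hz_to_bark(f):
    """Traunmueller's Hz-to-Bark conversion, common for Bark^2 vowel spaces."""
    f = np.asarray(f, float)
    return 26.81 * f / (1960.0 + f) - 0.53

def f2_slope(times_s, f2_hz):
    """Average F2 slope (Hz/s) of one vowel trajectory via least squares."""
    return np.polyfit(np.asarray(times_s), np.asarray(f2_hz), 1)[0]

# Toy trajectory: F2 rising over a 120 ms vowel
t = np.linspace(0.0, 0.12, 12)
print(f2_slope(t, 1400 + 2500 * t))   # ~2500 Hz/s
print(hz_to_bark([700, 1200, 2600]))  # corner-vowel formants in Bark
```

The Bark-converted (F1, F2) corner-vowel coordinates can then feed a polygon-area computation like the shoelace sketch shown earlier in this section.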

RESULTS: Participants were 45 ALS+bulbar (30 males, mean age = 61 ± 11), 22 ALS-bulbar (11 males, age = 62 ± 10), 22 bvFTD (13 males, age = 63 ± 7), and 34 HC (14 males, age = 69 ± 8). ALS+bulbar had smaller VSA and shallower average F2 slopes than ALS-bulbar (VSA: |d| = 0.86, p = 0.0088; F2 slope: |d| = 0.98, p = 0.0054), bvFTD (VSA: |d| = 0.67, p = 0.043; F2 slope: |d| = 1.4, p < 0.001), and HC (VSA: |d| = 0.73, p = 0.024; F2 slope: |d| = 1.0, p < 0.001). Vowel measures declined with worsening bulbar clinical scores (VSA: R = 0.33, p = 0.033; F2 slope: R = 0.25, p = 0.048), and smaller VSA was associated with greater listener effort (R = -0.43, p = 0.041). Shallower F2 slopes were related to cortical thinning in oralPMC (R = 0.50, p = 0.03). Neither vowel measure was associated with respiratory or cognitive test scores.

CONCLUSIONS: Vowel measures extracted with automatic processing from natural speech are sensitive to bulbar motor disease in ALS-FTD and are robust to cognitive impairment.}, } @article {pmid37203275, year = {2023}, author = {Easwar, V and Peng, ZE and Mak, V and Mikiel-Hunter, J}, title = {Differences between children and adults in the neural encoding of voice fundamental frequency in the presence of noise and reverberation.}, journal = {The European journal of neuroscience}, volume = {58}, number = {2}, pages = {2547-2562}, doi = {10.1111/ejn.16049}, pmid = {37203275}, issn = {1460-9568}, mesh = {Humans ; Adult ; Male ; Child ; Aged ; Adolescent ; *Speech Perception/physiology ; Noise ; Speech ; }, abstract = {Environmental noise and reverberation challenge speech understanding more significantly in children than in adults. However, the neural/sensory basis for the difference is poorly understood. We evaluated the impact of noise and reverberation on the neural processing of the fundamental frequency of voice (f0)-an important cue to tag or recognize a speaker. In a group of 39 6- to 15-year-old children and 26 adults with normal hearing, envelope following responses (EFRs) were elicited by a male-spoken /i/ in quiet, noise, reverberation, and both noise and reverberation. Due to increased resolvability of harmonics at lower than higher vowel formants that may affect susceptibility to noise and/or reverberation, the /i/ was modified to elicit two EFRs: one initiated by the low frequency first formant (F1) and the other initiated by mid to high frequency second and higher formants (F2+) with predominantly resolved and unresolved harmonics, respectively. F1 EFRs were more susceptible to noise whereas F2+ EFRs were more susceptible to reverberation. Reverberation resulted in greater attenuation of F1 EFRs in adults than children, and greater attenuation of F2+ EFRs in older than younger children. Reduced modulation depth caused by reverberation and noise explained changes in F2+ EFRs but was not the primary determinant for F1 EFRs. Experimental data paralleled modelled EFRs, especially for F1. Together, data suggest that noise or reverberation influences the robustness of f0 encoding depending on the resolvability of vowel harmonics and that maturation of processing temporal/envelope information of voice is delayed in reverberation, particularly for low frequency stimuli.}, } @article {pmid37173176, year = {2024}, author = {Wang, Y and Hattori, M and Masaki, K and Sumita, YI}, title = {Detailed speech evaluation including formant 3 analysis and voice visualization in maxillofacial rehabilitation: A clinical report.}, journal = {The Journal of prosthetic dentistry}, volume = {132}, number = {6}, pages = {1331.e1-1331.e7}, doi = {10.1016/j.prosdent.2023.02.022}, pmid = {37173176}, issn = {1097-6841}, mesh = {Humans ; Male ; Aged ; *Palatal Obturators ; Maxillary Sinus/diagnostic imaging ; Speech Disorders/rehabilitation/etiology ; Speech Production Measurement ; Speech Acoustics ; Voice Quality ; Maxilla ; }, abstract = {Objective speech evaluation such as analysis of formants 1 and 2 and nasality measurement have been used in maxillofacial rehabilitation for outcome assessment. However, in some patients, those evaluations are insufficient to assess a specific or unique problem. This report describes the use of a new speech evaluation including formant 3 analysis and voice visualization in a patient with a maxillofacial defect. 
The patient was a 67-year-old man who had a maxillary defect that opened to the maxillary sinus and who had an unnatural voice even when wearing an obturator. Nasality was low and the frequencies of formants 1 and 2 were normal even without the obturator. However, a low frequency of formant 3 and a shifted center of voice were observed. These results indicated that the unnatural voice was related to increased resonant volume in the pharynx rather than hypernasality. This case demonstrates that advanced speech analysis can be useful for detecting the cause of a speech disorder and planning maxillofacial rehabilitation.}, } @article {pmid37138997, year = {2023}, author = {Cavalcanti, JC and Eriksson, A and Barbosa, PA}, title = {On the speaker discriminatory power asymmetry regarding acoustic-phonetic parameters and the impact of speaking style.}, journal = {Frontiers in psychology}, volume = {14}, number = {}, pages = {1101187}, pmid = {37138997}, issn = {1664-1078}, abstract = {This study aimed to assess what we refer to as the speaker discriminatory power asymmetry and its forensic implications in comparisons performed in different speaking styles: spontaneous dialogues vs. interviews. We also addressed the impact of data sampling on the speaker's discriminatory performance concerning different acoustic-phonetic estimates. The participants were 20 male Brazilian Portuguese speakers from the same dialectal area. The speech material consisted of spontaneous telephone conversations between familiar individuals, and interviews conducted between each individual participant and the researcher. Nine acoustic-phonetic parameters were chosen for the comparisons, spanning from temporal and melodic to spectral acoustic-phonetic estimates. Ultimately, an analysis based on the combination of different parameters was also conducted. Two speaker discriminatory metrics were examined: the log-likelihood-ratio cost (Cllr) and Equal Error Rate (EER) values. A general speaker discriminatory trend was suggested when assessing the parameters individually. Parameters pertaining to the temporal acoustic-phonetic class depicted the weakest performance in terms of speaker contrasting power as evidenced by the relatively higher Cllr and EER values. Moreover, from the set of acoustic parameters assessed, spectral parameters, mainly high formant frequencies, i.e., F3 and F4, were the best performing in terms of speaker discrimination, depicting the lowest EER and Cllr scores. The results appear to suggest a speaker discriminatory power asymmetry concerning parameters from different acoustic-phonetic classes, in which temporal parameters tended to present a lower discriminatory power. The speaking style mismatch also seemed to considerably impact the speaker comparison task, by undermining the overall discriminatory performance. A statistical model based on the combination of different acoustic-phonetic estimates was found to perform best in this case.
Finally, data sampling has proven to be of crucial relevance for the reliability of discriminatory power assessment.}, } @article {pmid37129674, year = {2023}, author = {Zaltz, Y}, title = {The effect of stimulus type and testing method on talker discrimination of school-age children.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {5}, pages = {2611}, doi = {10.1121/10.0017999}, pmid = {37129674}, issn = {1520-8524}, mesh = {Adult ; Humans ; Child ; Adolescent ; Young Adult ; *Speech Perception ; Child Development ; Speech ; Linguistics ; Acoustics ; }, abstract = {Efficient talker discrimination (TD) improves speech understanding under multi-talker conditions. So far, TD of children has been assessed using various testing parameters, making it difficult to draw comparative conclusions. This study explored the effects of the stimulus type and variability on children's TD. Thirty-two children (7-10 years old) underwent eight TD assessments with fundamental frequency + formant changes using an adaptive procedure. Stimuli included consonant-vowel-consonant words or three-word sentences and were either fixed by run or by trial (changing throughout the run). Cognitive skills were also assessed. Thirty-one adults (18-35 years old) served as controls. The results showed (1) poorer TD for the fixed-by-trial than the fixed-by-run method, with both stimulus types for the adults but only with the words for the children; (2) poorer TD for the words than the sentences with the fixed-by-trial method only for the children; and (3) significant correlations between the children's age and TD. These results support a developmental trajectory in the use of perceptual anchoring for TD and in its reliance on comprehensive acoustic and linguistic information. The finding that the testing parameters may influence the top-down and bottom-up processing for TD should be considered when comparing data across studies or when planning new TD experiments.}, } @article {pmid37128454, year = {2022}, author = {Ghosh, S and Feng, Z and Bian, J and Butler, K and Prosperi, M}, title = {DR-VIDAL - Doubly Robust Variational Information-theoretic Deep Adversarial Learning for Counterfactual Prediction and Treatment Effect Estimation on Real World Data.}, journal = {AMIA ... Annual Symposium proceedings. AMIA Symposium}, volume = {2022}, number = {}, pages = {485-494}, pmid = {37128454}, issn = {1942-597X}, support = {R01 AI141810/AI/NIAID NIH HHS/United States ; R01 AI170187/AI/NIAID NIH HHS/United States ; R01 AG076234/AG/NIA NIH HHS/United States ; R21 CA245858/CA/NCI NIH HHS/United States ; R01 AI145552/AI/NIAID NIH HHS/United States ; R56 AG069880/AG/NIA NIH HHS/United States ; }, mesh = {Humans ; Prognosis ; *Electronic Health Records ; Causality ; }, abstract = {Determining causal effects of interventions onto outcomes from real-world, observational (non-randomized) data, e.g., treatment repurposing using electronic health records, is challenging due to underlying bias. Causal deep learning has improved over traditional techniques for estimating individualized treatment effects (ITE). We present the Doubly Robust Variational Information-theoretic Deep Adversarial Learning (DR-VIDAL), a novel generative framework that combines two joint models of treatment and outcome, ensuring an unbiased ITE estimation even when one of the two is misspecified. 
DR-VIDAL integrates: (i) a variational autoencoder (VAE) to factorize confounders into latent variables according to causal assumptions; (ii) an information-theoretic generative adversarial network (Info-GAN) to generate counterfactuals; (iii) a doubly robust block incorporating treatment propensities for outcome predictions. On synthetic and real-world datasets (Infant Health and Development Program, Twin Birth Registry, and National Supported Work Program), DR-VIDAL achieves better performance than other non-generative and generative methods. In conclusion, DR-VIDAL uniquely fuses causal assumptions, VAE, Info-GAN, and double robustness into a comprehensive, performant framework. Code is available at: https://github.com/Shantanu48114860/DR-VIDAL-AMIA-22 under MIT license.}, } @article {pmid37116009, year = {2024}, author = {Li, M and Erickson, IM and Cross, EV and Lee, JD}, title = {It's Not Only What You Say, But Also How You Say It: Machine Learning Approach to Estimate Trust from Conversation.}, journal = {Human factors}, volume = {66}, number = {6}, pages = {1724-1741}, pmid = {37116009}, issn = {1547-8181}, mesh = {Humans ; *Trust ; *Machine Learning ; Adult ; Communication ; Male ; Female ; }, abstract = {OBJECTIVE: The objective of this study was to estimate trust from conversations using both lexical and acoustic data.
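For readers unfamiliar with the double robustness property invoked in the DR-VIDAL abstract above, the core idea is the augmented IPW estimator. The following is a minimal sketch of that estimator alone (all variable names illustrative), not the paper's full VAE/Info-GAN pipeline:

```python
import numpy as np

def aipw_ate(y, t, e_hat, m1_hat, m0_hat):
    """Augmented IPW (doubly robust) estimate of the average treatment effect.

    y: observed outcomes; t: binary treatment indicators (0/1);
    e_hat: estimated propensity scores P(T=1|X);
    m1_hat, m0_hat: outcome-model predictions under treatment and control.
    The estimate stays unbiased if either the propensity model or the outcome
    models are correctly specified, which is the property the abstract cites.
    """
    dr1 = m1_hat + t * (y - m1_hat) / e_hat
    dr0 = m0_hat + (1 - t) * (y - m0_hat) / (1 - e_hat)
    return float(np.mean(dr1 - dr0))
```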

BACKGROUND: As NASA moves to long-duration space exploration operations, the increasing need for cooperation between humans and virtual agents requires real-time trust estimation by virtual agents. Measuring trust through conversation is a novel and unintrusive approach.

METHOD: A 2 (reliability) × 2 (cycles) × 3 (events) within-subject study with habitat system maintenance was designed to elicit various levels of trust in a conversational agent. Participants had trust-related conversations with the conversational agent at the end of each decision-making task. To estimate trust, subjective trust ratings were predicted using machine learning models trained on three types of conversational features (i.e., lexical, acoustic, and combined). After training, model explanation was performed using variable importance and partial dependence plots.

RESULTS: Results showed that a random forest algorithm, trained using the combined lexical and acoustic features, predicted trust in the conversational agent most accurately (adjusted R² = 0.71). The most important predictors were a combination of lexical and acoustic cues: average sentiment considering valence shifters, the mean of formants, and Mel-frequency cepstral coefficients (MFCC). These conversational features were identified as partial mediators predicting people's trust.
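As a rough illustration of the modeling step reported above, a random forest regressor fit on combined lexical and acoustic features can be scored with an adjusted R². The data, feature layout, and hyperparameters below are stand-ins, not the study's:

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

rng = np.random.default_rng(0)
X = rng.normal(size=(120, 20))   # columns: lexical (sentiment) + acoustic (formant means, MFCCs)
y = rng.normal(size=120)         # subjective trust ratings (stand-in values)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
model = RandomForestRegressor(n_estimators=500, random_state=0).fit(X_tr, y_tr)

r2 = r2_score(y_te, model.predict(X_te))
n, p = X_te.shape
r2_adj = 1 - (1 - r2) * (n - 1) / (n - p - 1)   # adjusted R^2, the metric reported above
print(f"adjusted R^2 = {r2_adj:.2f}")
```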

CONCLUSION: Precise trust estimation from conversation requires both lexical and acoustic cues.

APPLICATION: These results showed the possibility of using conversational data to measure trust, and potentially other dynamic mental states, unobtrusively and dynamically.}, } @article {pmid37106680, year = {2023}, author = {Teixeira, FL and Costa, MRE and Abreu, JP and Cabral, M and Soares, SP and Teixeira, JP}, title = {A Narrative Review of Speech and EEG Features for Schizophrenia Detection: Progress and Challenges.}, journal = {Bioengineering (Basel, Switzerland)}, volume = {10}, number = {4}, pages = {}, pmid = {37106680}, issn = {2306-5354}, support = {UIDB/05757/2020//Fundação para a Ciência e Tecnologia/ ; UIDP/05757/2020//Fundação para a Ciência e Tecnologia/ ; LA/P/0007/2021//Fundação para a Ciência e Tecnologia/ ; }, abstract = {Schizophrenia is a mental illness that affects an estimated 21 million people worldwide. The literature establishes that electroencephalography (EEG) is a well-implemented means of studying and diagnosing mental disorders. However, it is known that speech and language provide unique and essential information about human thought. Semantic and emotional content, semantic coherence, syntactic structure, and complexity can thus be combined in a machine learning process to detect schizophrenia. Several studies show that early identification is crucial to prevent the onset of illness or mitigate possible complications. Therefore, it is necessary to identify disease-specific biomarkers for an early diagnosis support system. This work contributes to improving our knowledge about schizophrenia and the features that can identify this mental illness via speech and EEG. The emotional state is a specific characteristic of schizophrenia that can be identified with speech emotion analysis. The speech features most often used in the literature are fundamental frequency (F0), intensity/loudness (I), frequency formants (F1, F2, and F3), Mel-frequency cepstral coefficients (MFCCs), the duration of pauses and sentences (SD), and the duration of silence between words. Combining at least two feature categories achieved high accuracy in schizophrenia classification. Prosodic and spectral or temporal features achieved the highest accuracy. The work with the highest accuracy used the prosodic and spectral features QEVA, SDVV, and SSDL, which were derived from the F0 and spectrogram. The emotional state can be identified with most of the features previously mentioned (F0, I, F1, F2, F3, MFCCs, and SD), linear prediction cepstral coefficients (LPCC), linear spectral features (LSF), and the pause rate. Using event-related potentials (ERP), the most promising features found in the literature are mismatch negativity (MMN), P2, P3, P50, N1, and N2. The EEG features with the highest accuracy in schizophrenia classification are the nonlinear features, such as Cx, HFD, and Lya.}, } @article {pmid37105171, year = {2023}, author = {Oganian, Y and Bhaya-Grossman, I and Johnson, K and Chang, EF}, title = {Vowel and formant representation in the human auditory speech cortex.}, journal = {Neuron}, volume = {111}, number = {13}, pages = {2105-2118.e4}, pmid = {37105171}, issn = {1097-4199}, support = {R01 DC012379/DC/NIDCD NIH HHS/United States ; U01 NS117765/NS/NINDS NIH HHS/United States ; }, mesh = {Humans ; Speech ; Phonetics ; Auditory Perception ; *Speech Perception ; *Auditory Cortex ; }, abstract = {Vowels, a fundamental component of human speech across all languages, are cued acoustically by formants, resonance frequencies of the vocal tract shape during speaking.
An outstanding question in neurolinguistics is how formants are processed neurally during speech perception. To address this, we collected high-density intracranial recordings from the human speech cortex on the superior temporal gyrus (STG) while participants listened to continuous speech. We found that two-dimensional receptive fields based on the first two formants provided the best characterization of vowel sound representation. Neural activity at single sites was highly selective for zones in this formant space. Furthermore, formant tuning is adjusted dynamically for speaker-specific spectral context. However, the entire population of formant-encoding sites was required to accurately decode single vowels. Overall, our results reveal that complex acoustic tuning in the two-dimensional formant space underlies local vowel representations in STG. As a population code, this gives rise to phonological vowel perception.}, } @article {pmid37080890, year = {2023}, author = {Herbst, CT and Story, BH and Meyer, D}, title = {Acoustical Theory of Vowel Modification Strategies in Belting.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.01.004}, pmid = {37080890}, issn = {1873-4588}, abstract = {Various authors have argued that belting is to be produced by "speech-like" sounds, with the first and second supraglottic vocal tract resonances (fR1 and fR2) at frequencies of the vowels determined by the lyrics to be sung. Acoustically, the hallmark of belting has been identified as a dominant second harmonic, possibly enhanced by first resonance tuning (fR1≈2fo). It is not clear how both these concepts - (a) phonating with "speech-like," unmodified vowels; and (b) producing a belting sound with a dominant second harmonic, typically enhanced by fR1 - can be upheld when singing across a singer's entire musical pitch range. For instance, anecdotal reports from pedagogues suggest that vowels with a low fR1, such as [i] or [u], might have to be modified considerably (by raising fR1) in order to phonate at higher pitches. These issues were systematically addressed in silico with respect to treble singing, using a linear source-filter voice production model. The dominant harmonic of the radiated spectrum was assessed in 12987 simulations, covering a parameter space of 37 fundamental frequencies (fo) across the musical pitch range from C3 to C6; 27 voice source spectral slope settings from -4 to -30 dB/octave; computed for 13 different IPA vowels. The results suggest that, for most unmodified vowels, the stereotypical belting sound characteristics with a dominant second harmonic can only be produced over a pitch range of about a musical fifth, centered at fo≈0.5fR1. In the [ɔ] and [ɑ] vowels, that range is extended to an octave, supported by a low second resonance. Data aggregation - considering the relative prevalence of vowels in American English - suggests that, historically, belting with fR1≈2fo was derived from speech, and that songs with an extended musical pitch range likely demand considerable vowel modification. 
We thus argue that - on acoustical grounds - the pedagogical commandment for belting with unmodified, "speech-like" vowels cannot always be fulfilled.}, } @article {pmid37078508, year = {2023}, author = {Dillon, MT and Helpard, L and Brown, KD and Selleck, AM and Richter, ME and Rooth, MA and Thompson, NJ and Dedmon, MM and Ladak, HM and Agrawal, S}, title = {Influence of the Frequency-to-Place Function on Recognition with Place-Based Cochlear Implant Maps.}, journal = {The Laryngoscope}, volume = {133}, number = {12}, pages = {3540-3547}, doi = {10.1002/lary.30710}, pmid = {37078508}, issn = {1531-4995}, support = {//Academic Medical Organization of Southwestern Ontario/ ; //MED-EL Medical Electronics/ ; //Natural Sciences and Engineering Research Council of Canada/ ; }, mesh = {Adult ; Humans ; *Cochlear Implants ; Artificial Intelligence ; *Speech Perception ; *Cochlear Implantation ; Cochlea/anatomy & histology ; Acoustic Stimulation/methods ; }, abstract = {OBJECTIVE: Comparison of acute speech recognition for cochlear implant (CI) alone and electric-acoustic stimulation (EAS) users listening with default maps or place-based maps using either a spiral ganglion (SG) or a new Synchrotron Radiation-Artificial Intelligence (SR-AI) frequency-to-place function.

METHODS: Thirteen adult CI-alone or EAS users completed a task of speech recognition at initial device activation with maps that differed in the electric filter frequency assignments. The three map conditions were: (1) maps with the default filter settings (default map), (2) place-based maps with filters aligned to cochlear SG tonotopicity using the SG function (SG place-based map), and (3) place-based maps with filters aligned to cochlear Organ of Corti (OC) tonotopicity using the SR-AI function (SR-AI place-based map). Speech recognition was evaluated using a vowel recognition task. Performance was scored as the percent correct for formant 1 recognition, because the maps would deviate the most in the estimated cochlear place frequency for low frequencies.
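For intuition about frequency-to-place functions like those compared above, the classic Greenwood map converts a position along the cochlea into a characteristic frequency. It is used here only as a stand-in for illustration; the study's SG and SR-AI functions differ from it:

```python
def greenwood_frequency(x, A=165.4, a=2.1, k=0.88):
    """Greenwood frequency-to-place map for the human cochlea.

    x: position as a fraction of cochlear length from apex (0) to base (1).
    Returns the characteristic frequency in Hz. Constants are the standard
    human values; this is not the SG or SR-AI function from the study.
    """
    return A * (10 ** (a * x) - k)

# Example: assign center frequencies to 12 evenly spaced, hypothetical
# electrode positions along the basal half of the cochlea.
places = [0.3 + 0.05 * i for i in range(12)]
centers = [round(greenwood_frequency(x)) for x in places]
print(centers)
```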

RESULTS: On average, participants had better performance with the OC SR-AI place-based map as compared to the SG place-based map and the default map. A larger performance benefit was observed for EAS users than for CI-alone users.

CONCLUSION: These pilot data suggest that EAS and CI-alone users may experience better performance with a patient-centered mapping approach that accounts for the variability in cochlear morphology (OC SR-AI frequency-to-place function) in the individualization of the electric filter frequencies (place-based mapping procedure).

LEVEL OF EVIDENCE: 3 Laryngoscope, 133:3540-3547, 2023.}, } @article {pmid37071803, year = {2023}, author = {Terband, H and van Brenk, F}, title = {Modeling Responses to Auditory Feedback Perturbations in Adults, Children, and Children With Complex Speech Sound Disorders: Evidence for Impaired Auditory Self-Monitoring?.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {5}, pages = {1563-1587}, doi = {10.1044/2023_JSLHR-22-00379}, pmid = {37071803}, issn = {1558-9102}, mesh = {Young Adult ; Humans ; Child ; Child, Preschool ; *Speech Sound Disorder ; Feedback ; *Speech Perception/physiology ; Reproducibility of Results ; Speech/physiology ; Feedback, Sensory/physiology ; *Stuttering ; }, abstract = {PURPOSE: Previous studies have found that typically developing (TD) children were able to compensate and adapt to auditory feedback perturbations to a similar or larger degree compared to young adults, while children with speech sound disorder (SSD) were found to produce predominantly following responses. However, large individual differences underlie the group-level results. This study investigates possible mechanisms in responses to formant shifts by modeling parameters of feedback and feedforward control of speech production based on behavioral data.

METHOD: SimpleDIVA was used to model an existing dataset of compensation/adaptation behavior to auditory feedback perturbations collected from three groups of Dutch speakers: 50 young adults, twenty-three 4- to 8-year-old children with TD speech, and seven 4- to 8-year-old children with SSD. Between-groups and individual within-group differences in model outcome measures representing auditory and somatosensory feedback control gain and feedforward learning rate were assessed.
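The logic of the model parameters named above (auditory feedback gain, somatosensory feedback gain, feedforward learning rate) can be sketched with a toy, single-formant adaptation loop. This is a schematic in the spirit of DIVA-style control, not the actual SimpleDIVA code; note how a negative auditory gain yields the "following" responses described in the conclusions below:

```python
import numpy as np

def adapt(n_trials=50, perturb=100.0, alpha_aud=0.5, lam=0.1, target=500.0):
    """Toy single-formant adaptation to a constant auditory perturbation (Hz).

    alpha_aud: auditory feedback control gain (negative values produce
    following responses); lam: feedforward learning rate. Schematic only.
    """
    ff = target                       # feedforward command
    produced = np.empty(n_trials)
    for i in range(n_trials):
        out = ff                              # planned production this trial
        err = target - (out + perturb)        # auditory error under perturbation
        out += alpha_aud * err                # within-trial feedback correction
        ff += lam * (target - (out + perturb))  # slow feedforward update
        produced[i] = out
    return produced

compensating = adapt()               # output drifts opposite the shift
following = adapt(alpha_aud=-0.3)    # negative gain: output drifts with the shift
```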

RESULTS: Notable between-groups and within-group variation was found for all outcome measures. Data modeled for individual speakers yielded model fits with varying reliability. Auditory feedback control gain was negative in children with SSD and positive in both other groups. Somatosensory feedback control gain was negative for both groups of children and marginally negative for adults. Feedforward learning rate measures were highest in the children with TD speech followed by children with SSD, compared to adults.

CONCLUSIONS: The SimpleDIVA model was able to account for responses to the perturbation of auditory feedback other than corrective, as negative auditory feedback control gains were associated with following responses to vowel shifts. These preliminary findings are suggestive of impaired auditory self-monitoring in children with complex SSD. Possible mechanisms underlying the nature of following responses are discussed.}, } @article {pmid37059081, year = {2023}, author = {Chao, SC and Daliri, A}, title = {Effects of Gradual and Sudden Introduction of Perturbations on Adaptive Responses to Formant-Shift and Formant-Clamp Perturbations.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {5}, pages = {1588-1599}, pmid = {37059081}, issn = {1558-9102}, support = {R01 DC019905/DC/NIDCD NIH HHS/United States ; R01 DC020162/DC/NIDCD NIH HHS/United States ; R21 DC017563/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Speech/physiology ; *Speech Perception ; Phonetics ; Speech Acoustics ; }, abstract = {PURPOSE: When the speech motor system encounters errors, it generates adaptive responses to compensate for the errors. Unlike errors induced by formant-shift perturbations, errors induced by formant-clamp perturbations do not correspond with the speaker's speech (i.e., degraded motor-to-auditory correspondence). We previously showed that adaptive responses to formant-clamp perturbations are smaller than responses to formant-shift perturbations when perturbations are introduced gradually. This study examined responses to formant-clamp and formant-shift perturbations when perturbations are introduced suddenly.

METHOD: One group of participants (n = 30) experienced gradually introduced formant-clamp and formant-shift perturbations, and another group (n = 30) experienced suddenly introduced formant-clamp and formant-shift perturbations. We designed the perturbations based on participant-specific vowel configurations such that a participant's first and second formants of /ɛ/ were perturbed toward their /æ/. To estimate adaptive responses, we measured formant changes (0-100 ms of the vowel) in response to the formant perturbations.
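The participant-specific design described above can be made concrete: a formant-shift displaces the feedback relative to what was produced, along the speaker's own /ɛ/ to /æ/ vector, while a formant-clamp holds the feedback at fixed values regardless of production (the degraded motor-to-auditory correspondence mentioned in the purpose statement). A minimal sketch with made-up formant values:

```python
def shift_toward(eh, ae, produced, magnitude=1.0):
    """Formant-shift: feedback = production displaced along the /e/->/ae/ vector."""
    return tuple(p + magnitude * (a - e) for p, e, a in zip(produced, eh, ae))

def clamp_at(eh, ae, magnitude=1.0):
    """Formant-clamp: feedback fixed at the shifted /e/ target, ignoring production."""
    return tuple(e + magnitude * (a - e) for e, a in zip(eh, ae))

eh_mean = (580.0, 1800.0)   # speaker's /e/ (F1, F2) in Hz; illustrative values
ae_mean = (700.0, 1650.0)   # speaker's /ae/ (F1, F2) in Hz; illustrative values
print(shift_toward(eh_mean, ae_mean, produced=(575.0, 1810.0)))
print(clamp_at(eh_mean, ae_mean))
```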

RESULTS: We found that (a) the difference between responses to formant-clamp and formant-shift perturbations was smaller when the perturbations were introduced suddenly and (b) responses to suddenly introduced (but not gradually introduced) formant-shift perturbations positively correlated with responses to formant-clamp perturbations.

CONCLUSIONS: These results showed that the speech motor system responds to errors induced by formant-shift and formant-clamp perturbations more differently when perturbations are introduced gradually than suddenly. Overall, the quality of errors (formant-shift vs. formant-clamp) and the manner of introducing errors (gradually vs. suddenly) modulate the speech motor system's evaluations of and responses to errors.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.22406422.}, } @article {pmid37040323, year = {2023}, author = {Luo, X and Daliri, A}, title = {The Impact of Bimodal Hearing on Speech Acoustics of Vowel Production in Adult Cochlear Implant Users.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {5}, pages = {1511-1524}, pmid = {37040323}, issn = {1558-9102}, support = {R01 DC019905/DC/NIDCD NIH HHS/United States ; R01 DC020162/DC/NIDCD NIH HHS/United States ; R21 DC017563/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Humans ; *Cochlear Implants ; Speech Acoustics ; *Cochlear Implantation ; Hearing ; *Hearing Aids ; }, abstract = {PURPOSE: This study aimed to investigate the acoustic changes in vowel production with different forms of auditory feedback via cochlear implant (CI), hearing aid (HA), and bimodal hearing (CI + HA).

METHOD: Ten post-lingually deaf adult bimodal CI users (aged 50-78 years) produced English vowels /i/, /ɛ/, /æ/, /ɑ/, /ʊ/, and /u/ in the context of /hVd/ during short-term use of no device (ND), HA, CI, and CI + HA. Segmental features (first formant frequency [F1], second formant frequency [F2], and vowel space area) and suprasegmental features (duration, intensity, and fundamental frequency [fo]) of vowel production were analyzed. Participants also categorized a vowel continuum synthesized from their own productions of /ɛ/ and /æ/ using HA, CI, and CI + HA.
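One of the segmental measures above, vowel space area, is simply the area of the polygon traced by the vowel means in the F1 x F2 plane. A small sketch using the shoelace formula, with illustrative values:

```python
def vowel_space_area(vertices):
    """Polygon area in the F1 x F2 plane via the shoelace formula.

    vertices: (F1, F2) vowel means in Hz, ordered around the perimeter.
    Returns the area in Hz^2.
    """
    n = len(vertices)
    acc = 0.0
    for i in range(n):
        x1, y1 = vertices[i]
        x2, y2 = vertices[(i + 1) % n]
        acc += x1 * y2 - x2 * y1
    return abs(acc) / 2.0

# Corner vowels /i, ae, a, u/ with made-up (F1, F2) means in Hz:
corners = [(300, 2300), (700, 1700), (750, 1100), (350, 900)]
print(vowel_space_area(corners))
```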

RESULTS: F1s of all vowels decreased; F2s of front vowels but not back vowels increased; vowel space areas increased; and vowel durations, intensities, and fos decreased with statistical significance in the HA, CI, and CI + HA conditions relative to the ND condition. Only fos were lower, and vowel space areas were larger with CI and CI + HA than with HA. Average changes in fo, intensity, and F1 from the ND condition to the HA, CI, and CI + HA conditions were positively correlated. Most participants did not show a typical psychometric function for vowel categorization, and thus, the relationship between vowel categorization and production was not tested.

CONCLUSIONS: The results suggest that acoustic, electric, and bimodal hearing have a measurable impact on vowel acoustics of post-lingually deaf adults when their hearing devices are turned on and off temporarily. Also, changes in fo and F1 with the use of hearing devices may be largely driven by changes in intensity.}, } @article {pmid37031224, year = {2023}, author = {Hsu, TC and Wu, BX and Lin, RT and Chien, CJ and Yeh, CY and Chang, TH}, title = {Electron-phonon interaction toward engineering carrier mobility of periodic edge structured graphene nanoribbons.}, journal = {Scientific reports}, volume = {13}, number = {1}, pages = {5781}, pmid = {37031224}, issn = {2045-2322}, support = {NSTC-109-2222-E-002-002-MY3//Ministry of Science and Technology, Taiwan/ ; 110-2622-8-002-014//Taiwan Semiconductor Manufacturing Company/ ; }, abstract = {Graphene nanoribbons have many extraordinary electrical properties and are candidates for the semiconductor industry. In this research, we propose a design of Coved GNRs with periodic structures ranging from 4 to 8 nm or more, whose sizes are within the practical feature sizes of advanced lithography tools. The Coved GNRs with the periodic coved shape are designed to break the localized electronic states and reduce electron-phonon scattering. In this way, the mobility of Coved GNRs can be enhanced by orders of magnitude compared with zigzag GNRs of the same width. Moreover, in contrast to the occasional zero-bandgap transitions of armchair and zigzag GNRs fabricated without atomic-level precision control, the Coved GNRs with periodic edge structures can exclude the zero-bandgap conditions, which makes the mass-production process practical. The designed Coved GNRs are fabricated over the germanium (110) substrate, where the graphene can be prepared in single-crystalline and single-oriented form and the edges of the GNRs are later repaired under "balanced condition growth". We demonstrate that the proposed coved structures are compatible with current fabrication facilities.}, } @article {pmid37015000, year = {2023}, author = {Vorperian, HK and Kent, RD and Lee, Y and Buhr, KA}, title = {Vowel Production in Children and Adults With Down Syndrome: Fundamental and Formant Frequencies of the Corner Vowels.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {4}, pages = {1208-1239}, pmid = {37015000}, issn = {1558-9102}, support = {R01 DC006282/DC/NIDCD NIH HHS/United States ; U54 HD090256/HD/NICHD NIH HHS/United States ; }, mesh = {Male ; Female ; Humans ; Adult ; Child ; Child, Preschool ; Adolescent ; Young Adult ; Middle Aged ; Aged ; Aged, 80 and over ; *Speech Acoustics ; *Down Syndrome ; Phonetics ; Speech Intelligibility ; Acoustics ; }, abstract = {PURPOSE: Atypical vowel production contributes to reduced speech intelligibility in children and adults with Down syndrome (DS). This study compares the acoustic data of the corner vowels /i/, /u/, /æ/, and /ɑ/ from speakers with DS against typically developing/developed (TD) speakers.

METHOD: Measurements of the fundamental frequency (fo) and first four formant frequencies (F1-F4) were obtained from single word recordings containing the target vowels from 81 participants with DS (ages 3-54 years) and 293 TD speakers (ages 4-92 years), all native speakers of English. The data were used to construct developmental trajectories and to determine interspeaker and intraspeaker variability.
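Extraction of fo and F1-F4 from single-word recordings, as described above, is commonly scripted through Praat. A minimal sketch using the parselmouth bindings; the file name and analysis settings are placeholders, not the study's:

```python
import parselmouth  # Praat bindings: pip install praat-parselmouth

snd = parselmouth.Sound("word.wav")        # hypothetical recording
pitch = snd.to_pitch()
f0 = pitch.selected_array['frequency']     # fo track in Hz (0 where unvoiced)

formants = snd.to_formant_burg(max_number_of_formants=5, maximum_formant=5500)
t_mid = snd.duration / 2                   # assumes the vowel sits mid-file
f1_to_f4 = [formants.get_value_at_time(n, t_mid) for n in range(1, 5)]
print(f1_to_f4)
```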

RESULTS: Trajectories for DS differed from TD based on age and sex, but the groups were similar in the striking change in fo and F1-F4 frequencies around age 10 years. Findings confirm higher fo in DS, and vowel-specific differences between DS and TD in F1 and F2 frequencies, but not F3 and F4. The F2 difference between front and back vowels was a more sensitive measure of vowel space compression than vowel space area/centralization across age and sex. Low vowels showed more pronounced F2 compression, which was related to reduced speech intelligibility. Intraspeaker variability was significantly greater for DS than TD for nearly all frequency values across age.

DISCUSSION: Vowel production differences between DS and TD are age- and sex-specific, which helps explain contradictory results in previous studies. Increased intraspeaker variability across age in DS confirms the presence of a persisting motor speech disorder. Atypical vowel production in DS is common and related to dysmorphology, delayed development, and disordered motor control.}, } @article {pmid37005127, year = {2023}, author = {Capobianco, S and Nacci, A and Calcinoni, O and Bruschini, L and Berrettini, S and Bottalico, P}, title = {Assessing Acoustic Parameters in Early Music and Romantic Operatic Singing.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.02.009}, pmid = {37005127}, issn = {1873-4588}, abstract = {OBJECTIVE: Since the recent early music (EM) revival, a subset of singers have begun to specialize in a style of singing that is perceptually different from the more "mainstream" romantic operatic (RO) singing style. The aim of this study is to characterize EM with respect to RO singing in terms of its vibrato characteristics and the singer's formant cluster.

STUDY DESIGN: This study presents a within-subject experimental design.

METHODS: Ten professional singers (5 F; 5 M) versed in both EM and RO repertoire were enrolled in the study. Each singer recorded the first 10 bars of the famous aria "Amarilli Mia Bella" (Giulio Caccini, 1602) a cappella, in RO and EM styles, in random order. Three sustained notes were extracted from the acoustic recordings and analyzed using the free, user-friendly software BioVoice to extract five parameters: vibrato rate, vibrato extent, vibrato jitter (Jvib), vibrato shimmer, and quality ratio (QR), an estimation of the singer's formant power.
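Vibrato rate and extent, two of the parameters listed above, can be estimated directly from an fo contour of a sustained note. A rough sketch using peak picking (not the BioVoice algorithm), demonstrated on a synthetic 5.5 Hz vibrato:

```python
import numpy as np
from scipy.signal import find_peaks

def vibrato_rate_extent(f0_hz, frames_per_sec):
    """Estimate vibrato rate (Hz) and extent (semitones) from a sustained-note
    fo track. Sketch only; not the BioVoice implementation."""
    st = 12 * np.log2(f0_hz / np.mean(f0_hz))   # contour in semitones re mean fo
    peaks, _ = find_peaks(st)
    troughs, _ = find_peaks(-st)
    rate = (len(peaks) - 1) * frames_per_sec / (peaks[-1] - peaks[0])
    extent = (np.mean(st[peaks]) - np.mean(st[troughs])) / 2  # half peak-to-trough
    return rate, extent

t = np.arange(0, 2, 0.01)                        # 100 frames/s for 2 s
f0 = 440 * 2 ** (0.5 * np.sin(2 * np.pi * 5.5 * t) / 12)  # +/- 0.5 st around 440 Hz
print(vibrato_rate_extent(f0, 100))              # approx. (5.5, 0.5)
```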

RESULTS: Vibrato in EM singing was characterized by a higher rate, a smaller extent, and a less regular cycle-to-cycle period duration (higher Jvib) compared to RO singing. As in previous studies, RO singing presented a more prominent singer's formant, as indicated by a smaller QR.

CONCLUSIONS: Acoustical analysis of some vibrato characteristics and the Singer's Formant significantly differentiated EM from RO singing styles. Given the acoustical distinctions between EM and RO styles, future scientific and musicological studies should consider distinguishing between the two styles rather than using a single term for, and description of, Western Classical singing.}, } @article {pmid37003707, year = {2023}, author = {Wood, S}, title = {Dating the open /æ/ sound change in Southern British English.}, journal = {JASA express letters}, volume = {3}, number = {3}, pages = {035205}, doi = {10.1121/10.0015281}, pmid = {37003707}, issn = {2691-1191}, abstract = {The new open /æ/ was not noticed in the non-regional received pronunciation (RP) accent of Southern British English until the 1980s. Dating to the 1950s or 1920s had been suggested, but the earliest known regional speaker exhibiting it was born in Kent in the 1860s. Formant data from archived recordings of 29 Southeastern speakers, born between the 1850s and 1960s, were studied using two methods: inspection of formant diagrams for closer /æ/, and modelling low vowels for open /æ/. The earliest RP speaker found with new open /æ/ was born in 1857, demonstrating that this type of sound change had started by the 1850s.}, } @article {pmid37002095, year = {2023}, author = {Serrurier, A and Neuschaefer-Rube, C}, title = {Morphological and acoustic modeling of the vocal tract.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {3}, pages = {1867}, doi = {10.1121/10.0017356}, pmid = {37002095}, issn = {1520-8524}, mesh = {Male ; Female ; Humans ; *Speech Acoustics ; Phonetics ; Speech ; *Voice ; Acoustics ; }, abstract = {In speech production, the anatomical morphology forms the substrate on which the speakers build their articulatory strategy to reach specific articulatory-acoustic goals. The aim of this study is to characterize morphological inter-speaker variability by building a shape model of the full vocal tract including hard and soft structures. Static magnetic resonance imaging data from 41 speakers articulating altogether 1947 phonemes were considered, and the midsagittal articulator contours were manually outlined. A phoneme-independent average-articulation representative of morphology was calculated as the speaker mean articulation. A principal component analysis-driven shape model was derived from average-articulations, leading to five morphological components, which explained 87% of the variance. Almost three-quarters of the variance was related to independent variations of the horizontal oral and vertical pharyngeal lengths, the latter capturing male-female differences. The three additional components captured shape variations related to head tilt and palate shape. Plane wave propagation acoustic simulations were run to characterize morphological components. A lengthening of 1 cm of the vocal tract in the vertical or horizontal directions led to a decrease in formant values of 7%-8%. Further analyses are required to analyze three-dimensional variability and to understand the morphological-acoustic relationships per phoneme.
Average-articulations and model code are publicly available (https://github.com/tonioser/VTMorphologicalModel).}, } @article {pmid36949035, year = {2023}, author = {Lou, Q and Wang, X and Chen, Y and Wang, G and Jiang, L and Liu, Q}, title = {Subjective and Objective Evaluation of Speech in Adult Patients With Repaired Cleft Palate.}, journal = {The Journal of craniofacial surgery}, volume = {34}, number = {6}, pages = {e551-e556}, doi = {10.1097/SCS.0000000000009301}, pmid = {36949035}, issn = {1536-3732}, mesh = {Adult ; Humans ; *Cleft Palate/surgery ; Speech ; Pharynx/surgery ; *Velopharyngeal Insufficiency/surgery ; Pharyngeal Muscles ; }, abstract = {OBJECTIVE: To explore the speech outcomes of adult patients with repaired cleft palate through subjective perception evaluation and objective acoustic analysis, and to compare the differences in pronunciation characteristics between speakers with complete velopharyngeal closure (VPC) and velopharyngeal insufficiency (VPI) patients.

PARTICIPANTS AND INTERVENTION: Subjective evaluation indicators included speech intelligibility, nasality, and consonant missing rate. For objective acoustic analysis, speech samples were normalized, and the objective acoustic parameters included normalized vowel formants, voice onset time, and the analysis of the 3-dimensional spectrogram and spectrum. Analyses were carried out on speech samples produced by four groups of speakers: (a) speakers with velopharyngeal competence after palatorrhaphy (n=38); (b) speakers with velopharyngeal incompetence after palatorrhaphy (n=70); (c) adult patients with cleft palate (n=65); and (d) typical speakers (n=30).

RESULTS: There was a highly negative correlation between VPC grade and speech intelligibility (ρ=-0.933), and a highly positive correlation between VPC and nasality (ρ=0.813). In subjective evaluation, the speech level of VPI patients was significantly lower than that of VPC patients and normal adults. Although the nasality and consonant loss rate of VPC patients were significantly higher than those of normal adults, the speech intelligibility of VPC patients was not significantly different from that of normal adults. In acoustic analysis, patients with VPI still performed poorly compared with patients with VPC.

CONCLUSIONS: The speech function of adult cleft palate patients is affected by abnormal palatal structure and poor pronunciation habits. In subjective evaluation, there was no significant difference in speech level between VPC patients and normal adults, whereas there was a significant difference between VPI patients and normal adults. The acoustic parameters were different between the 2 groups after cleft palate repair. The status of velopharyngeal closure after cleft palate repair can affect the patient's speech.}, } @article {pmid36946195, year = {2023}, author = {Easwar, V and Purcell, D and Wright, T}, title = {Predicting Hearing aid Benefit Using Speech-Evoked Envelope Following Responses in Children With Hearing Loss.}, journal = {Trends in hearing}, volume = {27}, number = {}, pages = {23312165231151468}, pmid = {36946195}, issn = {2331-2165}, mesh = {Adolescent ; Child ; Female ; Humans ; Male ; Evoked Potentials, Auditory ; *Hearing Aids ; *Hearing Loss/physiopathology/therapy ; *Speech Perception/physiology ; Speech/physiology ; }, abstract = {Electroencephalography could serve as an objective tool to evaluate hearing aid benefit in infants who are developmentally unable to participate in hearing tests. We investigated whether speech-evoked envelope following responses (EFRs), a type of electroencephalography-based measure, could predict improved audibility with the use of a hearing aid in children with mild-to-severe permanent, mainly sensorineural, hearing loss. In 18 children, EFRs were elicited by six male-spoken band-limited phonemic stimuli--the first formants of /u/ and /i/, the second and higher formants of /u/ and /i/, and the fricatives /s/ and /∫/--presented together as /su∫i/. EFRs were recorded between the vertex and nape, when /su∫i/ was presented at 55, 65, and 75 dB SPL using insert earphones in unaided conditions and individually fit hearing aids in aided conditions. EFR amplitude and detectability improved with the use of a hearing aid, and the degree of improvement in EFR amplitude was dependent on the extent of change in behavioral thresholds between unaided and aided conditions. EFR detectability was primarily influenced by audibility; higher sensation level stimuli had an increased probability of detection. Overall EFR sensitivity in predicting audibility was significantly higher in aided (82.1%) than unaided conditions (66.5%) and did not vary as a function of stimulus or frequency. EFR specificity in ascertaining inaudibility was 90.8%. Aided improvement in EFR detectability was a significant predictor of hearing aid-facilitated change in speech discrimination accuracy. Results suggest that speech-evoked EFRs could be a useful objective tool in predicting hearing aid benefit in children with hearing loss.}, } @article {pmid36945094, year = {2023}, author = {Duan, H and Xie, Q and Zhang, Z}, title = {Characteristics of Alveolo-palatal Affricates Produced by Mandarin-speaking Children with Repaired Cleft Palate.}, journal = {American journal of health behavior}, volume = {47}, number = {1}, pages = {13-20}, doi = {10.5993/AJHB.47.1.2}, pmid = {36945094}, issn = {1945-7359}, mesh = {Humans ; Child ; Child, Preschool ; *Cleft Palate/surgery ; Phonetics ; Language ; }, abstract = {Objectives: In this study, we examined the acoustic properties of the alveolo-palatal affricates /tɕ/ and /tɕʰ/ in Mandarin Chinese, and analyzed the differences in the acoustic characteristics of these affricates as produced by children with repaired cleft palate and normally developing children.
We also explored the relationship between the affricates and the high-front vowel /i/. Methods: We analyzed 16 monosyllabic words with alveolo-palatal affricates as the initial consonants, produced by children with repaired cleft palate (N=13, mean age=5.9 years) and normally developing children (N=6, mean age=5.3 years). We used several acoustic parameters to investigate the characteristics of these affricates, such as the center of gravity, VOT, and the formants of vowels. Results: Compared with normally developing children, children with cleft palate exhibited a lower center of gravity for the 2 affricates /tɕ/ and /tɕʰ/. Data from the control group showed that the affricate /tɕʰ/ had a significantly greater center of gravity than that of /tɕ/. The accuracy of /tɕ, tɕʰ/ produced by speakers with cleft palate was significantly correlated with that of /i/ (r=0.63). The high-front vowel /i/ is a significant index in diagnosing speech intelligibility and is more valuable than /a/ and /u/. There was a significant difference in F2 of the vowel /i/ between children with cleft palate before speech therapy (CS1) and after speech therapy (CS2). After speech intervention, the accuracy of the affricates produced by children with cleft palate improved, and the acoustic properties of "stop + noise segments" appeared. Conclusion: Children with cleft palate can be better distinguished from children with normal development by 2 significant acoustic characteristics: center of gravity and VOT. As the alveolo-palatal affricates /tɕ, tɕʰ/ and the high-front vowel /i/ have a similar place of articulation, the front of the tongue blade, their production accuracy can be improved mutually. The analysis showed that the articulation of Chinese /i/ has a higher and more frontal lingual position and less variability, which is more conducive to articulation training and improves the effect of cleft palate training. These findings suggest a potential relationship between the affricates /tɕ, tɕʰ/ and the vowel /i/. Children with cleft palate have difficulty pronouncing /tɕ, tɕʰ/ and /i/. It is better to start training with the vowel /i/, resulting in improvement in overall speech intelligibility.}, } @article {pmid36938342, year = {2023}, author = {Alghowinem, S and Gedeon, T and Goecke, R and Cohn, JF and Parker, G}, title = {Interpretation of Depression Detection Models via Feature Selection Methods.}, journal = {IEEE transactions on affective computing}, volume = {14}, number = {1}, pages = {133-152}, pmid = {36938342}, issn = {1949-3045}, support = {R01 MH051435/MH/NIMH NIH HHS/United States ; R01 MH065376/MH/NIMH NIH HHS/United States ; R01 MH096951/MH/NIMH NIH HHS/United States ; }, abstract = {Given the prevalence of depression worldwide and its major impact on society, several studies employed artificial intelligence modelling to automatically detect and assess depression. However, interpretation of these models and cues are rarely discussed in detail in the AI community, but have received increased attention lately. In this study, we aim to analyse the commonly selected features using a proposed framework of several feature selection methods and their effect on the classification results, which will provide an interpretation of the depression detection model. The developed framework aggregates and selects the most promising features for modelling depression detection from 38 feature selection algorithms of different categories. Using three real-world depression datasets, 902 behavioural cues were extracted from speech behaviour, speech prosody, eye movement and head pose.
To verify the generalisability of the proposed framework, we applied the entire process to depression datasets individually and when combined. The results from the proposed framework showed that speech behaviour features (e.g. pauses) are the most distinctive features of the depression detection model. From the speech prosody modality, the strongest feature groups were F0, HNR, formants, and MFCC, while for the eye activity modality they were left-right eye movement and gaze direction, and for the head modality it was yaw head movement. Modelling depression detection using the selected features (even though there are only 9 features) outperformed using all features in all the individual and combined datasets. Our feature selection framework not only provided an interpretation of the model but also produced higher accuracy of depression detection with a small number of features in varied datasets. This could help to reduce the processing time needed to extract features and create the model.}, } @article {pmid36882955, year = {2023}, author = {Hauser, I}, title = {Differential Cue Weighting in Mandarin Sibilant Production.}, journal = {Language and speech}, volume = {66}, number = {4}, pages = {1056-1090}, pmid = {36882955}, issn = {1756-6053}, mesh = {Humans ; *Cues ; Phonetics ; *Speech Perception ; Speech ; Speech Acoustics ; }, abstract = {Individual talkers vary in their relative use of different cues to signal phonological contrast. Previous work provides limited and conflicting data on whether such variation is modulated by cue trading or individual differences in speech style. This paper examines differential cue weighting patterns in Mandarin sibilants as a test case for these hypotheses. Standardized Mandarin exhibits a three-way place contrast between retroflex, alveopalatal, and alveolar sibilants with individual differences in relative weighting of spectral center of gravity (COG) and the second formant of the following vowel (F2). In results from a speech production task, cue weights of COG and F2 are inversely correlated across speakers, demonstrating a trade-off relationship in cue use. These findings are consistent with a cue trading account of individual differences in contrast signaling.}, } @article {pmid36880531, year = {2023}, author = {Yang, X and Guo, C and Zhang, M and Li, Y and Ren, M and Mao, S and Dhakal, R and Kim, NY and Dong, Z and Sun, B and Yao, Z}, title = {Ultrahigh-sensitivity multi-parameter tacrolimus solution detection based on an anchor planar millifluidic microwave biosensor.}, journal = {Analytical methods : advancing methods and applications}, volume = {15}, number = {14}, pages = {1765-1774}, doi = {10.1039/d3ay00100h}, pmid = {36880531}, issn = {1759-9679}, mesh = {*Tacrolimus ; Microwaves ; Radio Waves ; Limit of Detection ; *Biosensing Techniques ; }, abstract = {To detect drug concentration in tacrolimus solution, an anchor planar millifluidic microwave (APMM) biosensor is proposed. The millifluidic system integrated with the sensor enables accurate and efficient detection while eliminating interference caused by the fluidity of the tacrolimus sample. Different concentrations (10-500 ng mL[-1]) of the tacrolimus analyte were introduced into the millifluidic channel, where it completely interacts with the radio frequency patch electromagnetic field, thereby effectively and sensitively modifying the resonant frequency and amplitude of the transmission coefficient.
Experimental results indicate that the sensor has an extremely low limit of detection (LoD) of 0.12 pg mL[-1] and a frequency detection resolution (FDR) of 1.59 MHz/(ng mL[-1]). The greater the FDR and the lower the LoD, the more feasible the label-free biosensing method. Regression analysis revealed a strong linear correlation (R[2] = 0.992) between the concentration of tacrolimus and the frequency difference of the two resonant peaks of APMM. In addition, the difference in the reflection coefficient between the two formants was measured and calculated, and a strong linear correlation (R[2] = 0.998) was found between the difference and tacrolimus concentration. Five measurements were performed on each individual sample of tacrolimus to validate the biosensor's high repeatability. Consequently, the proposed biosensor is a potential candidate for the early detection of tacrolimus drug concentration levels in organ transplant recipients. This study presents a simple method for constructing microwave biosensors with high sensitivity and rapid response.}, } @article {pmid36859160, year = {2023}, author = {Liu, Z and Xu, Y}, title = {Deep learning assessment of syllable affiliation of intervocalic consonants.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {2}, pages = {848}, doi = {10.1121/10.0017117}, pmid = {36859160}, issn = {1520-8524}, mesh = {Male ; Humans ; *Deep Learning ; Acoustics ; Emotions ; Judgment ; Language ; }, abstract = {In English, a sentence like "He made out our intentions." could be misperceived as "He may doubt our intentions." because the coda /d/ sounds like it has become the onset of the next syllable. The nature and occurrence conditions of this resyllabification phenomenon are unclear, however. Previous empirical studies mainly relied on listener judgment, limited acoustic evidence, such as voice onset time, or average formant values to determine the occurrence of resyllabification. This study tested the hypothesis that resyllabification is a coarticulatory reorganisation that realigns the coda consonant with the vowel of the next syllable. Deep learning in conjunction with dynamic time warping (DTW) was used to assess syllable affiliation of intervocalic consonants. The results suggest that convolutional neural network- and recurrent neural network-based models can detect cases of resyllabification using Mel-frequency spectrograms. DTW analysis shows that neural network inferred resyllabified sequences are acoustically more similar to their onset counterparts than their canonical productions. A binary classifier further suggests that, similar to the genuine onsets, the inferred resyllabified coda consonants are coarticulated with the following vowel.
These results are interpreted with an account of resyllabification as a speech-rate-dependent coarticulatory reorganisation mechanism in speech.}, } @article {pmid36859151, year = {2023}, author = {Lasota, M and Šidlof, P and Maurerlehner, P and Kaltenbacher, M and Schoder, S}, title = {Anisotropic minimum dissipation subgrid-scale model in hybrid aeroacoustic simulations of human phonation.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {2}, pages = {1052}, doi = {10.1121/10.0017202}, pmid = {36859151}, issn = {1520-8524}, mesh = {Humans ; *Acoustics ; Anisotropy ; Computer Simulation ; *Computer Systems ; Phonation ; }, abstract = {This article deals with large-eddy simulations of three-dimensional incompressible laryngeal flow followed by acoustic simulations of human phonation of five cardinal English vowels, /ɑ, æ, i, o, u/. The flow and aeroacoustic simulations were performed in OpenFOAM and in-house code openCFS, respectively. Given the large variety of scales in the flow and acoustics, the simulation is separated into two steps: (1) computing the flow in the larynx using the finite volume method on a fine moving grid with 2.2 million elements, followed by (2) computing the sound sources separately and wave propagation to the radiation zone around the mouth using the finite element method on a coarse static grid with 33 000 elements. The numerical results showed that the anisotropic minimum dissipation model, which is not well known since it is not available in common CFD software, predicted stronger sound pressure levels at higher harmonics, and especially at the first two formants, than the wall-adapting local eddy-viscosity model. When this model was applied to the turbulent flow in the larynx, a positive impact on the quality of the simulated vowels was found.}, } @article {pmid36857868, year = {2023}, author = {Huang, Z and Lobbezoo, F and Vanhommerig, JW and Volgenant, CMC and de Vries, N and Aarab, G and Hilgevoord, AAJ}, title = {Effects of demographic and sleep-related factors on snoring sound parameters.}, journal = {Sleep medicine}, volume = {104}, number = {}, pages = {3-10}, doi = {10.1016/j.sleep.2023.02.012}, pmid = {36857868}, issn = {1878-5506}, mesh = {Adult ; Humans ; Male ; Middle Aged ; *Snoring ; *Sleep Apnea, Obstructive ; Sleep ; Polysomnography ; Demography ; }, abstract = {OBJECTIVE: To investigate the effect of frequently reported between-individual (viz., age, gender, body mass index [BMI], and apnea-hypopnea index [AHI]) and within-individual (viz., sleep stage and sleep position) snoring sound-related factors on snoring sound parameters in temporal, intensity, and frequency domains.

METHODS: This study included 83 adult snorers (mean ± SD age: 42.2 ± 11.3 yrs; male gender: 59%) who underwent an overnight polysomnography (PSG) and simultaneous sound recording, from which a total of 131,745 snoring events were extracted and analyzed. Data on both between-individual and within-individual factors were extracted from the participants' PSG reports.
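The dominant frequency (DF) of a snoring event, one of the frequency-domain parameters analyzed in this study, is typically taken as the peak of the event's power spectrum. A minimal sketch over one synthetic event, with an illustrative sampling rate:

```python
import numpy as np
from scipy.signal import welch

def dominant_frequency(event, fs):
    """Dominant frequency (Hz) of one snoring event via Welch's PSD estimate."""
    freqs, psd = welch(event, fs=fs, nperseg=min(2048, len(event)))
    return freqs[np.argmax(psd)]

fs = 8000                                  # illustrative sampling rate
t = np.arange(0, 1.0, 1 / fs)
event = np.sin(2 * np.pi * 110 * t) + 0.3 * np.random.default_rng(0).normal(size=t.size)
print(dominant_frequency(event, fs))       # approx. 110 Hz
```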

RESULTS: Gender did not have any significant effect on snoring sound parameters. The fundamental frequency (FF; coefficient = -0.31; P = 0.02) and dominant frequency (DF; coefficient = -12.43; P < 0.01) of snoring sounds decreased with increasing age, and the second formant increased (coefficient = 22.91; P = 0.02) with increasing BMI. Severe obstructive sleep apnea (OSA; AHI ≥30 events/hour), non-rapid eye movement sleep stage 3 (N3), and supine position were all associated with more, longer, and louder snoring events (P < 0.05). Supine position was associated with higher FF and DF, and lateral decubitus positions were associated with higher formants.

CONCLUSIONS: Within the limitations of the current patient profile and included factors, AHI was found to have greater effects on snoring sound parameters than the other between-individual factors. The included within-individual factors were found to have greater effects on snoring sound parameters than the between-individual factors under study.}, } @article {pmid36844947, year = {2023}, author = {Wang, L and Jiang, Z}, title = {Tidal Volume Level Estimation Using Respiratory Sounds.}, journal = {Journal of healthcare engineering}, volume = {2023}, number = {}, pages = {4994668}, pmid = {36844947}, issn = {2040-2309}, mesh = {Humans ; *Respiratory Sounds ; Snoring ; Tidal Volume ; *Sleep Apnea, Obstructive ; Algorithms ; }, abstract = {Respiratory sounds have been used as a noninvasive and convenient method to estimate respiratory flow and tidal volume. However, current methods need calibration, making them difficult to use in a home environment. A respiratory sound analysis method is proposed to estimate tidal volume levels during sleep qualitatively. Respiratory sounds are filtered and segmented into one-minute clips, and all clips are clustered into three categories (normal breathing, snoring, and uncertain) with agglomerative hierarchical clustering (AHC). Formant parameters are extracted to classify snoring clips into simple snoring and obstructive snoring with the K-means algorithm. For simple snoring clips, the tidal volume level is calculated based on the snoring duration. For obstructive snoring clips, the tidal volume level is calculated by the maximum breathing pause interval. The performance of the proposed method is evaluated on an open dataset, PSG-Audio, in which full-night polysomnography (PSG) and tracheal sound were recorded simultaneously. The calculated tidal volume levels are compared with the corresponding lowest nocturnal oxygen saturation (LoO2) data. Experiments show that the proposed method calculates tidal volume levels with high accuracy and robustness.}, } @article {pmid36816289, year = {2023}, author = {Aldamen, H and Al-Deaibes, M}, title = {Arabic emphatic consonants as produced by English speakers: An acoustic study.}, journal = {Heliyon}, volume = {9}, number = {2}, pages = {e13401}, pmid = {36816289}, issn = {2405-8440}, abstract = {This study examines the production of emphatic consonants by American L2 learners of Arabic. To this end, 19 participants, 5 native speakers and 14 L2 learners, took part in a production experiment in which they produced monosyllabic CVC pairs that were contrasted in terms of whether the initial consonant was plain or emphatic. The acoustic parameters that were investigated are VOT of voiceless stops, COG of fricatives, and the first three formant frequencies of the target vowels. The results of the native speakers showed that VOT is a reliable acoustic correlate of emphasis in MSA. The results also showed that vowels in the emphatic context have higher F1 and F3 and lower F2. The results showed that the L2 learners produced comparable VOT values to those of native Arabic speakers. Further, L2 learners produced a significantly lower F2 of the vowels in the emphatic context than that in the plain context. Proficiency in Arabic played a role in the F2 measure; the intermediate learners tended to be more native-like than the beginning learners. As for F3, the results of the L2 learners unexpectedly showed that the beginning learners produced a higher F3 in the context of fricatives only.
This suggests that the relationship between emphasis and proficiency depends on whether the preceding consonant is a stop or fricative.}, } @article {pmid36816122, year = {2023}, author = {Ali, IE and Sumita, Y and Wakabayashi, N}, title = {Comparison of Praat and Computerized Speech Lab for formant analysis of five Japanese vowels in maxillectomy patients.}, journal = {Frontiers in neuroscience}, volume = {17}, number = {}, pages = {1098197}, pmid = {36816122}, issn = {1662-4548}, abstract = {INTRODUCTION: Speech impairment is a common complication after surgical resection of maxillary tumors. Maxillofacial prosthodontists play a critical role in restoring this function so that affected patients can enjoy better lives. For that purpose, several acoustic software packages have been used for speech evaluation, among which Computerized Speech Lab (CSL) and Praat are widely used in clinical and research contexts. Although CSL is a commercial product, Praat is freely available on the internet and can be used by patients and clinicians to practice several therapy goals. Therefore, this study aimed to determine whether both software packages produced comparable results for the first two formant frequencies (F1 and F2) and their respective formant ranges obtained from the same voice samples from Japanese participants with maxillectomy defects.

METHODS: CSL was used as a reference to evaluate the accuracy of Praat with both the default and newly proposed adjusted settings. Thirty-seven participants were enrolled in this study for formant analysis of the five Japanese vowels (a/i/u/e/o) using CSL and Praat. Spearman's rank correlation coefficient was used to judge the correlation between the analysis results of both programs regarding F1 and F2 and their respective formant ranges.
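The agreement test described above reduces to a rank correlation between paired measurements from the two programs. A tiny sketch with stand-in F1 values:

```python
import numpy as np
from scipy.stats import spearmanr

# Stand-in F1 values (Hz) for the same vowel tokens as measured by each program;
# these numbers are invented for illustration, not taken from the study.
f1_csl = np.array([742.0, 310.0, 365.0, 450.0, 480.0, 615.0])
f1_praat = np.array([735.0, 305.0, 372.0, 455.0, 470.0, 602.0])

rho, p = spearmanr(f1_csl, f1_praat)
print(f"Spearman rho = {rho:.2f}, p = {p:.4f}")
```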

RESULTS: Highly positive correlations between the two programs were found for all acoustic features and all Praat settings.

DISCUSSION: The strong correlations between the results of both CSL and Praat suggest that both programs may have similar decision strategies for atypical speech and for both sexes. This study highlights that the default settings in Praat can be used for formant analysis in maxillectomy patients with predictable accuracy. The proposed adjusted settings in Praat can yield more accurate results for formant analysis of atypical speech in maxillectomy cases when the examiner cannot precisely locate the formant frequencies using the default settings or confirm analysis results obtained using CSL.}, } @article {pmid36748155, year = {2023}, author = {Zhang, C and Hou, Q and Guo, TT and Zhong, JT and Ren, H and Li, GL}, title = {[The effect of Wendler Glottoplasty to elevate vocal pitch in transgender women].}, journal = {Zhonghua er bi yan hou tou jing wai ke za zhi = Chinese journal of otorhinolaryngology head and neck surgery}, volume = {58}, number = {2}, pages = {139-144}, doi = {10.3760/cma.j.cn115330-20220518-00275}, pmid = {36748155}, issn = {1673-0860}, support = {81900926//National Natural Science Foundation of China/ ; 7204246//Beijing Natural Science Foundation/ ; }, mesh = {Humans ; Male ; Female ; Young Adult ; Adult ; Middle Aged ; *Transgender Persons ; Retrospective Studies ; Speech Acoustics ; Voice Quality ; Phonation ; }, abstract = {Objective: To evaluate the effect of Wendler Glottoplasty on elevating vocal pitch in transgender women. Methods: The pre-surgery and 3-month post-surgery voice parameters of 29 transgender women who underwent Wendler Glottoplasty in the department of otorhinolaryngology head and neck surgery of Beijing Friendship Hospital from January 2017 to October 2020 were retrospectively analyzed. The 29 transgender women ranged in age from 19 to 47 (27.0±6.3) years. Subjective evaluation was performed using the Transsexual Voice Questionnaire for Male-to-Female (TVQ[MtF]). Objective parameters included fundamental frequency (F0), highest pitch, lowest pitch, habitual volume, Jitter, Shimmer, maximal phonation time (MPT), noise-to-harmonic ratio (NHR) and formant frequencies (F1, F2, F3, F4). SPSS 25.0 software was used for statistical analysis. Results: Three months after surgery, the score of TVQ[MtF] was significantly decreased [(89.9±14.7) vs. (50.4±13.6), t=11.49, P<0.001]. The F0 was significantly elevated [(152.7±23.3) Hz vs. (207.7±45.9) Hz, t=-6.03, P<0.001]. Frequencies of F1, F2 and F3 were significantly elevated. No statistical difference was observed in the frequencies of F4. The highest pitch was not significantly altered while the lowest pitch was significantly elevated [(96.8±17.7) Hz vs. (120.0±28.9) Hz, t=-3.71, P=0.001]. Habitual speech volume was significantly increased [(60.0±5.2) dB vs. (63.6±9.6) dB, t=-2.12, P=0.043]. Jitter, Shimmer, NHR and MPT were not significantly altered (P>0.05). Conclusions: Wendler Glottoplasty could notably elevate the vocal pitch, formant frequencies and degree of vocal femininity in transgender women without affecting phonation ability and voice quality.
It can be an effective treatment modality for voice feminization.}, } @article {pmid36742666, year = {2022}, author = {Gunjawate, DR and Ravi, R and Tauro, JP and Philip, R}, title = {Spectral and Temporal Characteristics of Vowels in Konkani.}, journal = {Indian journal of otolaryngology and head and neck surgery : official publication of the Association of Otolaryngologists of India}, volume = {74}, number = {Suppl 3}, pages = {4870-4879}, pmid = {36742666}, issn = {2231-3796}, abstract = {The present study was undertaken to study the acoustic characteristics of vowels using spectrographic analysis in the Mangalorean Catholic Konkani dialect of Konkani spoken in Mangalore, Karnataka, India. Recordings were made of CVC words produced by 11 males and 19 females aged 18-55 years. The CVC words consisted of combinations of vowels such as (/i, i:, e, ɵ, ə, u, o, ɐ, ӓ, ɔ/) and consonants such as (/m, k, w, s, ʅ, h, l, r, p, ʤ, g, n, Ɵ, ṭ, ḷ, b, dh/). Recordings were made in a sound-treated room using PRAAT software, and spectrographic analysis was performed to measure spectral and temporal characteristics such as fundamental frequency (F0), formants (F1, F2, F3), and vowel duration. The results showed that higher fundamental frequency values were observed for short, high and back vowels. Higher F1 values were noted for open vowels and F2 was higher for front vowels. Long vowels had longer duration compared to short vowels, and females had longer vowel duration compared to males. The acoustic information in terms of spectral and temporal cues helps in better understanding the production and perception of languages and dialects.}, } @article {pmid36742539, year = {2022}, author = {Prakash, P and Boominathan, P and Mahalingam, S}, title = {Acoustic Description of Bhramari Pranayama.}, journal = {Indian journal of otolaryngology and head and neck surgery : official publication of the Association of Otolaryngologists of India}, volume = {74}, number = {Suppl 3}, pages = {4738-4747}, pmid = {36742539}, issn = {2231-3796}, abstract = {UNLABELLED: The study's aim was (1) to describe the acoustic characteristics of Bhramari pranayama, and (2) to compare the acoustic features of the nasal consonant /m/ and the sound of Bhramari pranayama produced by yoga trainers. Cross-sectional study design. Thirty-three adult male yoga trainers performed five repeats of the nasal consonant /m/ and Bhramari pranayama. These samples were recorded into Computerized Speech Lab, Kay Pentax model 4500b, using a microphone (SM48). Formant frequencies (fF1, fF2, fF3, and fF4), formant bandwidths (BF1, BF2, BF3, and BF4), anti-formants, and the alpha and beta ratios were analyzed. The nasal consonant /m/ had a higher fF2 and anti-formant compared to Bhramari pranayama. Statistically significant differences were noted in fF2, BF3, and anti-formants. Bhramari pranayama revealed a lower alpha ratio and a higher beta ratio than /m/. However, these differences were not statistically significant. Findings are discussed from acoustic and physiological perspectives. Bhramari pranayama was assumed to be produced with a larger pharyngeal cavity and narrower velar passage when compared to the nasal consonant /m/. Verification at the level of the glottis and with aerodynamic parameters may ascertain the above propositions.

SUPPLEMENTARY INFORMATION: The online version contains supplementary material available at 10.1007/s12070-021-03054-1.}, } @article {pmid36732236, year = {2023}, author = {Kondaurova, MV and Zheng, Q and Donaldson, CW and Smith, AF}, title = {Effect of telepractice on pediatric cochlear implant users and provider vowel space: A preliminary report.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {1}, pages = {467}, doi = {10.1121/10.0016866}, pmid = {36732236}, issn = {1520-8524}, mesh = {Child ; Humans ; *Cochlear Implants ; Speech Acoustics ; Speech Production Measurement ; *Cochlear Implantation ; *Deafness/rehabilitation ; Phonetics ; *Speech Perception ; }, abstract = {Clear speaking styles are goal-oriented modifications in which talkers adapt acoustic-phonetic characteristics of speech to compensate for communication challenges. Do children with hearing loss and a clinical provider modify speech characteristics during telepractice to adjust for remote communication? The study examined the effect of telepractice (tele-) on vowel production in seven (mean age 4:11 years, SD 1:2 years) children with cochlear implants (CIs) and a provider. The first (F1) and second (F2) formant frequencies of /i/, /ɑ/, and /u/ vowels were measured in child and provider speech during one in-person and one tele-speech-language intervention, order counterbalanced. Child and provider vowel space areas (VSA) were calculated. The results demonstrated an increase in F2 formant frequency for /i/ vowel in child and provider speech and an increase in F1 formant frequency for /ɑ/ vowel in the provider speech during tele- compared to in-person intervention. An expansion of VSA was found in child and provider speech in tele- compared to in-person intervention. In children, the earlier age of CI activation was associated with larger VSA in both tele- and in-person intervention. The results suggest that the children and the provider adjust vowel articulation in response to remote communication during telepractice.}, } @article {pmid36719795, year = {2022}, author = {Kirby, J and Pittayaporn, P and Brunelle, M}, title = {Transphonologization of onset voicing: revisiting Northern and Eastern Kmhmu'.}, journal = {Phonetica}, volume = {79}, number = {6}, pages = {591-629}, pmid = {36719795}, issn = {1423-0321}, mesh = {Humans ; *Voice ; Phonation ; Language ; Speech Acoustics ; Acoustics ; Phonetics ; }, abstract = {Phonation and vowel quality are often thought to play a vital role at the initial stage of tonogenesis. This paper investigates the production of voicing and tones in a tonal Northern Kmhmu' dialect spoken in Nan Province, Thailand, and a non-tonal Eastern Kmhmu' dialect spoken in Vientiane, Laos, from both acoustic and electroglottographic perspectives. Large and consistent VOT differences between voiced and voiceless stops are preserved in Eastern Kmhmu', but are not found in Northern Kmhmu', consistent with previous reports. With respect to pitch, f0 is clearly a secondary property of the voicing contrast in Eastern Kmhmu', but unquestionably the primary contrastive property in Northern Kmhmu'. Crucially, no evidence is found to suggest that either phonation type or formant differences act as significant cues to voicing in Eastern Kmhmu' or tones in Northern Kmhmu'. 
These results suggest that voicing contrasts can also be transphonologized directly into f0-based contrasts, skipping a registral stage based primarily on phonation and/or vowel quality.}, } @article {pmid36714887, year = {2023}, author = {Viegas, F and Camargo, Z and Viegas, D and Guimarães, GS and Luiz, RR and Ritto, F and Simões-Zenari, M and Nemr, K}, title = {Acoustic Measurements of Speech and Voice in Men with Angle Class II, Division 1, Malocclusion.}, journal = {International archives of otorhinolaryngology}, volume = {27}, number = {1}, pages = {e10-e15}, pmid = {36714887}, issn = {1809-9777}, abstract = {Introduction The acoustic analysis of speech (measurements of the fundamental frequency and formant frequencies) of different vowels produced by speakers with the Angle class II, division 1, malocclusion can provide information about the relationship between articulatory and phonatory mechanisms in this type of maxillomandibular disproportion. Objectives To investigate acoustic measurements related to the fundamental frequency (F0) and formant frequencies (F1 and F2) of the oral vowels of Brazilian Portuguese (BP) produced by male speakers with Angle class II, division 1, malocclusion (study group) and compare with men with Angle class I malocclusion (control group). Methods In total, 60 men (20 with class II, 40 with class I) aged between 18 and 40 years were included in the study. Measurements of F0, F1 and F2 of the seven oral vowels of BP were estimated from the audio samples containing repetitions of carrier sentences. The statistical analysis was performed using the Student t-test and the effect size was calculated. Results Significant differences (p-values) were detected for F0 values in five vowels ([e], [i], [ᴐ], [o] and [u]), and for F1 in vowels [a] and [ᴐ], with higher values for class II, division 1. Conclusion Statistical differences were found in the F0 measurements with higher values in five of the seven vowels analysed in subjects with Angle class II, division 1. The formant frequencies showed differences only in F1 in two vowels with higher values in the study group. The data suggest that data on voice and speech production must be included in assessment protocols for patients with malocclusion.}, } @article {pmid36712820, year = {2023}, author = {Freeman, V}, title = {Production and perception of prevelar merger: Two-dimensional comparisons using Pillai scores and confusion matrices.}, journal = {Journal of phonetics}, volume = {97}, number = {}, pages = {}, pmid = {36712820}, issn = {0095-4470}, support = {R01 DC006014/DC/NIDCD NIH HHS/United States ; }, abstract = {Vowel merger production is quantified with gradient acoustic measures, while phonemic perception methods are often coarser, complicating comparisons within mergers in progress. This study implements a perception experiment in two-dimensional formant space (F1 × F2), allowing unified plotting, quantification, and statistics with production data. Production and perception are compared within 20 speakers for a two-part prevelar merger in progress in Pacific Northwest English, where mid-front /ɛ, e/ approximate or merge before voiced velar /ɡ/ (leg-vague merger), and low-front prevelar /æɡ/ raises toward them (bag-raising). Distributions are visualized with kernel density plots and overlap quantified with Pillai scores and confusion matrices from linear discriminant analysis models.
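Freeman's entry quantifies category overlap with Pillai scores and confusion matrices from linear discriminant analysis. The following minimal Python sketch computes both on simulated F1 × F2 tokens; the vowel means, dispersions, and token counts are invented, and the study's actual corpus and preprocessing are not reproduced here.

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix

def pillai_score(X, labels):
    """Pillai's trace for a one-way MANOVA on 2-D formant data (F1, F2)."""
    grand = X.mean(axis=0)
    B = np.zeros((X.shape[1], X.shape[1]))   # between-class scatter
    W = np.zeros_like(B)                     # within-class scatter
    for g in np.unique(labels):
        Xg = X[labels == g]
        d = (Xg.mean(axis=0) - grand)[:, None]
        B += len(Xg) * d @ d.T
        W += (Xg - Xg.mean(axis=0)).T @ (Xg - Xg.mean(axis=0))
    eig = np.linalg.eigvals(np.linalg.solve(W, B)).real
    return float(np.sum(eig / (1.0 + eig)))

rng = np.random.default_rng(1)
# Hypothetical F1/F2 tokens for two vowel categories approaching merger.
leg = rng.normal([650, 1900], 60, size=(50, 2))
vague = rng.normal([600, 1950], 60, size=(50, 2))
X = np.vstack([leg, vague])
y = np.array(["leg"] * 50 + ["vague"] * 50)

print("Pillai:", round(pillai_score(X, y), 3))   # 0 = full overlap, 1 = full separation
pred = LinearDiscriminantAnalysis().fit(X, y).predict(X)
print(confusion_matrix(y, pred, labels=["leg", "vague"]))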
Results suggest that leg-vague merger is perceived as more complete than it is produced (in both the sample and community), while bag-raising is highly variable in production but rejected in perception. Relationships between production and perception varied by age: raising and merger progressed across two generations in production but not in perception; younger adults then perceived the leg-vague merger without producing it, and varied in both (minimal) raising perception and bag-raising production. Thus, prevelar raising/merger may be progressing among some social groups but reversing in others.}, } @article {pmid36701896, year = {2023}, author = {Holmes, E and Johnsrude, IS}, title = {Intelligibility benefit for familiar voices is not accompanied by better discrimination of fundamental frequency or vocal tract length.}, journal = {Hearing research}, volume = {429}, number = {}, pages = {108704}, doi = {10.1016/j.heares.2023.108704}, pmid = {36701896}, issn = {1878-5891}, support = {MOP 133450//CIHR/Canada ; }, mesh = {Humans ; *Voice ; Speech ; Cognition ; *Speech Perception ; Heart Rate ; }, abstract = {Speech is more intelligible when it is spoken by familiar than unfamiliar people. If this benefit arises because key voice characteristics like perceptual correlates of fundamental frequency or vocal tract length (VTL) are more accurately represented for familiar voices, listeners may be able to discriminate smaller manipulations to such characteristics for familiar than unfamiliar voices. We measured participants' (N = 17) thresholds for discriminating pitch (correlate of fundamental frequency, or glottal pulse rate) and formant spacing (correlate of VTL; 'VTL-timbre') for voices that were familiar (participants' friends) and unfamiliar (other participants' friends). As expected, familiar voices were more intelligible. However, discrimination thresholds were no smaller for the same familiar voices. The size of the intelligibility benefit for a familiar over an unfamiliar voice did not relate to the difference in discrimination thresholds for the same voices. Also, the familiar-voice intelligibility benefit was just as large following perceptible manipulations to pitch and VTL-timbre. These results are more consistent with cognitive accounts of speech perception than traditional accounts that predict better discrimination.}, } @article {pmid36689265, year = {2023}, author = {Ettore, E and Müller, P and Hinze, J and Riemenschneider, M and Benoit, M and Giordana, B and Hurlemann, R and Postin, D and Lecomte, A and Musiol, M and Lindsay, H and Robert, P and König, A}, title = {Digital Phenotyping for Differential Diagnosis of Major Depressive Episode: Narrative Review.}, journal = {JMIR mental health}, volume = {10}, number = {}, pages = {e37225}, pmid = {36689265}, issn = {2368-7959}, abstract = {BACKGROUND: Major depressive episode (MDE) is a common clinical syndrome. It can be found in different pathologies such as major depressive disorder (MDD), bipolar disorder (BD), posttraumatic stress disorder (PTSD), or even occur in the context of psychological trauma. However, only 1 syndrome is described in international classifications (Diagnostic and Statistical Manual of Mental Disorders, Fifth Edition [DSM-5]/International Classification of Diseases 11th Revision [ICD-11]), which do not take into account the underlying pathology at the origin of the MDE. Clinical interviews are currently the best source of information to obtain the etiological diagnosis of MDE.
Nevertheless, they do not allow early diagnosis, and they yield no objective measures of the clinical information extracted. To remedy this, digital tools correlated with clinical symptomatology could be useful.

OBJECTIVE: We aimed to review the current application of digital tools for MDE diagnosis while highlighting shortcomings for further research. In addition, our work focused on digital devices that are easy to use during clinical interviews and on mental health conditions in which depression is common.

METHODS: We conducted a narrative review of the use of digital tools during clinical interviews for MDE by searching papers published in PubMed/MEDLINE, Web of Science, and Google Scholar databases since February 2010. The search was conducted from June to September 2021. Potentially relevant papers were then compared against a checklist for relevance and reviewed independently for inclusion, with focus on 4 allocated topics: (1) automated voice analysis, (2) behavior analysis by video, and the physiological measures of (3) heart rate variability (HRV) and (4) electrodermal activity (EDA). For this purpose, we were interested in 4 frequently found clinical conditions in which MDE can occur: (1) MDD, (2) BD, (3) PTSD, and (4) psychological trauma.

RESULTS: A total of 74 relevant papers on the subject were qualitatively analyzed and the information was synthesized. Thus, a digital phenotype of MDE seems to emerge, consisting of modifications in speech features (namely, temporal, prosodic, spectral, source, and formant features) and in speech content, modifications in nonverbal behavior (head, hand, body and eyes movement, facial expressivity, and gaze), and a decrease in physiological measurements (HRV and EDA). We found not only similarities but also differences when MDE occurs in MDD, BD, PTSD, or psychological trauma. However, comparative studies were rare in BD or PTSD conditions, which does not allow us to identify clear and distinct digital phenotypes.
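The digital phenotype summarized above spans temporal, prosodic, spectral, source, and formant speech features. As a hedged sketch of how a few such features might be extracted from a recorded interview segment, assuming the librosa library and a hypothetical file name (this is not the pipeline of any reviewed study):

import numpy as np
import librosa

def speech_features(path):
    """A minimal prosodic/spectral feature vector of the kind reviewed above."""
    y, sr = librosa.load(path, sr=16000)
    f0, voiced, _ = librosa.pyin(y, fmin=60, fmax=400, sr=sr)
    f0 = f0[~np.isnan(f0)]
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    rms = librosa.feature.rms(y=y)[0]
    return {
        "f0_mean": float(f0.mean()) if f0.size else 0.0,   # prosodic
        "f0_sd": float(f0.std()) if f0.size else 0.0,
        "voiced_ratio": float(np.mean(voiced)),            # temporal
        "rms_sd": float(rms.std()),                        # intensity variability
        "mfcc_means": mfcc.mean(axis=1).tolist(),          # spectral envelope
    }

print(speech_features("interview_segment.wav"))            # hypothetical file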

CONCLUSIONS: Our search identified markers from several modalities that hold promise for helping with a more objective diagnosis of MDE. To validate their potential, further longitudinal and prospective studies are needed.}, } @article {pmid36680472, year = {2023}, author = {Aoyama, K and Hong, L and Flege, JE and Akahane-Yamada, R and Yamada, T}, title = {Relationships Between Acoustic Characteristics and Intelligibility Scores: A Reanalysis of Japanese Speakers' Productions of American English Liquids.}, journal = {Language and speech}, volume = {66}, number = {4}, pages = {1030-1045}, doi = {10.1177/00238309221140910}, pmid = {36680472}, issn = {1756-6053}, mesh = {Adult ; Child ; Humans ; United States ; Japan ; *Speech Acoustics ; *Language ; Speech ; Acoustics ; Speech Intelligibility ; Phonetics ; }, abstract = {The primary purpose of this research report was to investigate the relationships between acoustic characteristics and perceived intelligibility for native Japanese speakers' productions of American English liquids. This report was based on a reanalysis of intelligibility scores and acoustic analyses that were reported in two previous studies. We examined which acoustic parameters were associated with higher perceived intelligibility scores for their productions of /l/ and /ɹ/ in American English, and whether Japanese speakers' productions of the two liquids were acoustically differentiated from each other. Results demonstrated that the second formant (F2) was strongly correlated with the perceived intelligibility scores for the Japanese adults' productions. Results also demonstrated that the Japanese adults' and children's productions of /l/ and /ɹ/ were indeed differentiated by some acoustic parameters including the third formant (F3). In addition, some changes occurred in the Japanese children's productions over the course of 1 year. Overall, the present report shows that Japanese speakers of American English may be making a distinction between /l/ and /ɹ/ in production, although the distinctions are made in a different way compared with native English speakers' productions. These findings have implications for setting realistic goals for improving intelligibility of English /l/ and /ɹ/ for Japanese speakers, as well as theoretical advancement of second-language speech learning.}, } @article {pmid36608104, year = {2023}, author = {Sahin, S and Sen Yilmaz, B}, title = {Effects of the Orthognathic Surgery on the Voice Characteristics of Skeletal Class III Patients.}, journal = {The Journal of craniofacial surgery}, volume = {34}, number = {1}, pages = {253-257}, doi = {10.1097/SCS.0000000000008843}, pmid = {36608104}, issn = {1536-3732}, mesh = {Adult ; Humans ; Male ; Female ; Voice Quality ; Speech Acoustics ; *Orthognathic Surgery ; *Voice ; Acoustics ; }, abstract = {OBJECTIVES: To analyze the effects of the bimaxillary orthognathic surgery on the voice characteristics of skeletal Class III cases, and to evaluate correlations between acoustic and skeletal changes.

METHOD: Skeletal Class III adult patients (7 male, 18 female) were asked to pronounce the sounds "[a], [ɛ], [ɯ], [i], [ɔ], [œ], [u], [y]" for 3 seconds. Voice recordings and lateral cephalometric x-rays were taken before surgery (T0) and 6 months after (T1). Voice recordings were taken for the control group at a 6-month interval (n=20). The fundamental frequency (F0), the formant frequencies (F1, F2, and F3), and the Shimmer, Jitter, and Noise-to-Harmonic Ratio (NHR) parameters were measured with Praat version 6.0.43.

RESULTS: In the surgery group, significant differences were observed in F1 of [ɛ], F2 and Shimmer of [ɯ], F1 and F2 of [œ], and F1 of [y]; the post-surgery values were lower, whereas F3 of the [u] sound was higher. In comparison with the control group, ΔF3 of the [ɔ] and [u] sounds and ΔF1 of the [y] sound, ΔShimmer of [ɛ], [ɯ], [i], [ɔ], [u] and [y], and ΔNHR of the [ɔ] sound changed significantly. Pearson correlation analysis revealed some correlations: between ΔF2 and ΔSNA for the [ɯ] and [œ] sounds, and between ΔF1 and ΔHBV for the [y] sound.

CONCLUSION: Bimaxillary orthognathic surgery changed some voice parameters in skeletal Class III patients. Some correlations were found between skeletal and acoustic parameters. We advise clinicians to consider these findings and inform their patients.}, } @article {pmid36593767, year = {2023}, author = {Kim, S and Choi, J and Cho, T}, title = {Data on English coda voicing contrast under different prosodic conditions produced by American English speakers and Korean learners of English.}, journal = {Data in brief}, volume = {46}, number = {}, pages = {108816}, pmid = {36593767}, issn = {2352-3409}, abstract = {This data article provides acoustic data for individual speakers' production of coda voicing contrast between stops in English, which are based on laboratory speech recorded by twelve native speakers of American English and twenty-four Korean learners of English. There were four pairs of English monosyllabic target words with voicing contrast in the coda position (bet-bed, pet-ped, bat-bad, pat-pad). The words were produced in carrier sentences in which they were placed in two different prosodic boundary conditions (Intonational Phrase initial and Intonational Phrase medial), two pitch accent conditions (nuclear-pitch accented and unaccented), and three focus conditions (lexical focus, phonological focus and no focus). The raw acoustic measurement values that are included in a CSV-formatted file are F0, F1, F2 and duration of each vowel preceding a coda consonant, and Voice Onset Time of word-initial stops. This article also provides figures that exemplify individual speaker variation of vowel duration, F0, F1 and F2 as a function of focus conditions. The data can thus be potentially reused to observe individual variations in phonetic encoding of coda voicing contrast as a function of the aforementioned prosodically conditioned factors (i.e., prosodic boundary, pitch accent, focus) in native vs. non-native English. Some theoretical aspects of the data are discussed in the full-length article entitled "Phonetic encoding of coda voicing contrast under different focus conditions in L1 vs. L2 English" [1].}, } @article {pmid36586864, year = {2022}, author = {Herbst, CT and Story, BH}, title = {Computer simulation of vocal tract resonance tuning strategies with respect to fundamental frequency and voice source spectral slope in singing.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {6}, pages = {3548}, doi = {10.1121/10.0014421}, pmid = {36586864}, issn = {1520-8524}, mesh = {Male ; Female ; Humans ; *Singing ; Computer Simulation ; *Voice ; Sound ; Vibration ; }, abstract = {A well-known concept of singing voice pedagogy is "formant tuning," where the lowest two vocal tract resonances (fR1, fR2) are systematically tuned to harmonics of the laryngeal voice source to maximize the level of radiated sound. A comprehensive evaluation of this resonance tuning concept is still needed. Here, the effect of fR1, fR2 variation was systematically evaluated in silico across the entire fundamental frequency range of classical singing for three voice source characteristics with spectral slopes of -6, -12, and -18 dB/octave. Respective vocal tract transfer functions were generated with a previously introduced low-dimensional computational model, and resultant radiated sound levels were expressed in dB(A). Two distinct strategies for optimized sound output emerged for low vs high voices.
At low pitches, spectral slope was the predominant factor for sound level increase, and resonance tuning only had a marginal effect. In contrast, resonance tuning strategies became more prevalent and voice source strength played an increasingly marginal role as fundamental frequency increased to the upper limits of the soprano range. This suggests that different voice classes (e.g., low male vs high female) likely have fundamentally different strategies for optimizing sound output, which has important implications for pedagogical practice.}, } @article {pmid36578688, year = {2022}, author = {Ji, Y and Hu, Y and Jiang, X}, title = {Segmental and suprasegmental encoding of speaker confidence in Wuxi dialect vowels.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {1028106}, pmid = {36578688}, issn = {1664-1078}, abstract = {INTRODUCTION: Wuxi dialect is a variety of Wu dialect spoken in eastern China and is characterized by a rich tonal system. Compared with standard Mandarin speakers, native speakers of the Wuxi dialect can be more efficient in varying vocal cues to encode communicative meanings in speech communication. While the literature has demonstrated that speakers encode high vs. low confidence in global prosodic cues at the sentence level, it is unknown how speakers' intended confidence is encoded at a more local, phonetic level. This study aimed to explore the effects of speakers' intended confidence on both prosodic and formant features of vowels in two lexical tones (the flat tone and the contour tone) of Wuxi dialect.

METHODS: Words consisting of a single vowel were spoken in a confident, unconfident, or neutral tone of voice by native Wuxi dialect speakers using a standard elicitation procedure. Linear mixed-effects modeling and parametric bootstrap testing were performed.
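A minimal sketch of the kind of linear mixed-effects model named in this METHODS section, using Python's statsmodels with a hypothetical long-format table (columns speaker, confidence, tone, F1); the formula is illustrative, not the authors' exact specification.

import pandas as pd
import statsmodels.formula.api as smf

# Hypothetical long-format table: one row per vowel token, with the
# speaker-intended confidence level, lexical tone, and measured F1.
df = pd.read_csv("wuxi_vowels.csv")   # columns: speaker, confidence, tone, F1

# Random intercept per speaker; fixed effects of confidence and tone.
model = smf.mixedlm("F1 ~ C(confidence) * C(tone)", df, groups=df["speaker"])
fit = model.fit()
print(fit.summary())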

RESULTS: The results showed that (1) the speakers raised both F1 and F2 at the confident level (compared with the neutral-intending expression), and F1 additionally distinguished the confident from the unconfident expressions; (2) compared with the neutral-intending expression, the speakers raised mean f0, showed greater f0 variation, and prolonged pronunciation time at the unconfident level, while they raised mean intensity, showed greater intensity variation, and prolonged pronunciation time at the confident level; and (3) the speakers modulated mean f0 and mean intensity to a larger extent on the flat tone than on the contour tone to differentiate levels of confidence, while they modulated f0 and intensity range more only on the contour tone.

DISCUSSION: These findings shed new light on the mechanisms of segmental and suprasegmental encoding of speaker confidence and lack of confidence at the vowel level, highlighting the interplay of lexical tone and vocal expression in speech communication.}, } @article {pmid36571115, year = {2023}, author = {Grawunder, S and Uomini, N and Samuni, L and Bortolato, T and Girard-Buttoz, C and Wittig, RM and Crockford, C}, title = {Expression of concern: 'Chimpanzee vowel-like sounds and voice quality suggest formant space expansion through the hominoid lineage' (2022) by Grawunder et al.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {378}, number = {1870}, pages = {20220476}, doi = {10.1098/rstb.2022.0476}, pmid = {36571115}, issn = {1471-2970}, } @article {pmid38875684, year = {2022}, author = {Iyer, R and Meyer, D}, title = {Detection of Suicide Risk Using Vocal Characteristics: Systematic Review.}, journal = {JMIR biomedical engineering}, volume = {7}, number = {2}, pages = {e42386}, pmid = {38875684}, issn = {2561-3278}, abstract = {BACKGROUND: In an age when telehealth services are increasingly being used for forward triage, there is a need for accurate suicide risk detection. Vocal characteristics analyzed using artificial intelligence are now proving capable of detecting suicide risk with accuracies superior to traditional survey-based approaches, suggesting an efficient and economical approach to ensuring ongoing patient safety.

OBJECTIVE: This systematic review aimed to identify which vocal characteristics perform best at differentiating patients at elevated risk of suicide from other cohorts, and to identify the methodological specifications of the systems used to derive each feature and the resulting classification accuracies.

METHODS: A search of MEDLINE via Ovid, Scopus, Computers and Applied Science Complete, CADTH, Web of Science, ProQuest Dissertations and Theses A&I, Australian Policy Online, and Mednar was conducted between 1995 and 2020 and updated in 2021. The inclusion criteria were human participants with no language, age, or setting restrictions applied; randomized controlled studies, observational cohort studies, and theses; studies that used some measure of vocal quality; and individuals assessed as being at high risk of suicide compared with other individuals at lower risk using a validated measure of suicide risk. Risk of bias was assessed using the Risk of Bias in Non-randomized Studies tool. A random-effects model meta-analysis was used wherever mean measures of vocal quality were reported.

RESULTS: The search yielded 1074 unique citations, of which 30 (2.79%) were screened via full text. A total of 21 studies involving 1734 participants met all inclusion criteria. Most studies (15/21, 71%) sourced participants via either the Vanderbilt II database of recordings (8/21, 38%) or the Silverman and Silverman perceptual study recording database (7/21, 33%). Candidate vocal characteristics that performed best at differentiating between high risk of suicide and comparison cohorts included timing patterns of speech (median accuracy 95%), power spectral density sub-bands (median accuracy 90.3%), and mel-frequency cepstral coefficients (median accuracy 80%). A random-effects meta-analysis was used to compare 22 characteristics nested within 14% (3/21) of the studies, which demonstrated significant standardized mean differences for frequencies within the first and second formants (standardized mean difference ranged between -1.07 and -2.56) and jitter values (standardized mean difference=1.47). In 43% (9/21) of the studies, risk of bias was assessed as moderate, whereas in the remaining studies (12/21, 57%), the risk of bias was assessed as high.
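The pooled standardized mean differences above come from a random-effects meta-analysis. A compact sketch of the commonly used DerSimonian-Laird estimator follows; the per-study effects and variances are invented for illustration, chosen to fall within the reported -1.07 to -2.56 range.

import numpy as np

def dersimonian_laird(effects, variances):
    """Random-effects pooled estimate of standardized mean differences."""
    effects, variances = np.asarray(effects), np.asarray(variances)
    w = 1.0 / variances                           # fixed-effect weights
    fixed = np.sum(w * effects) / np.sum(w)
    Q = np.sum(w * (effects - fixed) ** 2)        # heterogeneity statistic
    df = len(effects) - 1
    c = np.sum(w) - np.sum(w ** 2) / np.sum(w)
    tau2 = max(0.0, (Q - df) / c)                 # between-study variance
    w_re = 1.0 / (variances + tau2)
    pooled = np.sum(w_re * effects) / np.sum(w_re)
    se = np.sqrt(1.0 / np.sum(w_re))
    return pooled, pooled - 1.96 * se, pooled + 1.96 * se

# Hypothetical per-study SMDs and variances for a formant-frequency measure.
print(dersimonian_laird([-1.07, -1.80, -2.56], [0.12, 0.20, 0.25]))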

CONCLUSIONS: Although several key methodological issues prevailed among the studies reviewed, there is promise in the use of vocal characteristics to detect elevations in suicide risk, particularly in novel settings such as telehealth or conversational agents.

TRIAL REGISTRATION: PROSPERO International Prospective Register of Systematic Reviews CRD420200167413; https://www.crd.york.ac.uk/prospero/display_record.php?ID=CRD42020167413.}, } @article {pmid36508721, year = {2023}, author = {Moya-Galé, G and Wisler, AA and Walsh, SJ and McAuliffe, MJ and Levy, ES}, title = {Acoustic Predictors of Ease of Understanding in Spanish Speakers With Dysarthria Associated With Parkinson's Disease.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {8S}, pages = {2999-3012}, doi = {10.1044/2022_JSLHR-22-00284}, pmid = {36508721}, issn = {1558-9102}, mesh = {Humans ; *Dysarthria/complications ; Speech Intelligibility ; Speech Acoustics ; *Parkinson Disease/complications ; Acoustics ; Speech Production Measurement ; }, abstract = {PURPOSE: The purpose of this study was to examine selected baseline acoustic features of hypokinetic dysarthria in Spanish speakers with Parkinson's disease (PD) and identify potential acoustic predictors of ease of understanding in Spanish.

METHOD: Seventeen Spanish-speaking individuals with mild-to-moderate hypokinetic dysarthria secondary to PD and eight healthy controls were recorded reading a translation of the Rainbow Passage. Acoustic measures of vowel space area, as indicated by the formant centralization ratio (FCR), envelope modulation spectra (EMS), and articulation rate, were derived from the speech samples. Additionally, 15 healthy adults rated ease of understanding of the recordings on a visual analogue scale. A multiple linear regression model was implemented to investigate the predictive value of the selected acoustic parameters on ease of understanding.
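The formant centralization ratio (FCR) used in this study is conventionally computed from corner-vowel formant means. Below is a minimal sketch under the assumption that the usual definition (Sapir and colleagues' ratio) applies; the example values are hypothetical.

def fcr(f1, f2):
    """Formant centralization ratio from per-vowel mean F1/F2 (Hz).
    f1, f2: dicts keyed by corner vowel. Higher FCR = more centralized vowels.
    The formula follows the common definition, an assumption here:
    (F2u + F2a + F1i + F1u) / (F2i + F1a)."""
    return (f2["u"] + f2["a"] + f1["i"] + f1["u"]) / (f2["i"] + f1["a"])

# Hypothetical corner-vowel means for one speaker.
f1 = {"i": 320.0, "a": 750.0, "u": 350.0}
f2 = {"i": 2300.0, "a": 1300.0, "u": 900.0}
print(round(fcr(f1, f2), 3))   # values nearer 1 indicate centralization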

RESULTS: Listeners' ease of understanding was significantly lower for speakers with dysarthria than for healthy controls. The FCR, EMS from the first 10 s of the reading passage, and the difference in EMS between the end and the beginning sections of the passage differed significantly between the two groups of speakers. Findings indicated that 67.7% of the variability in ease of understanding was explained by the predictive model, suggesting a moderately strong relationship between the acoustic and perceptual domains.

CONCLUSIONS: Measures of envelope modulation spectra were found to be highly significant model predictors of ease of understanding of Spanish-speaking individuals with hypokinetic dysarthria associated with PD. Articulation rate was also found to be important (albeit to a lesser degree) in the predictive model. The formant centralization ratio should be further examined with a larger sample size and more severe dysarthria to determine its efficacy in predicting ease of understanding.}, } @article {pmid36477984, year = {2023}, author = {Peng, H and Li, S and Xing, J and Yang, F and Wu, A}, title = {Surface plasmon resonance of Au/Ag metals for the photoluminescence enhancement of lanthanide ion Ln[3+] doped upconversion nanoparticles in bioimaging.}, journal = {Journal of materials chemistry. B}, volume = {11}, number = {24}, pages = {5238-5250}, doi = {10.1039/d2tb02251f}, pmid = {36477984}, issn = {2050-7518}, mesh = {*Lanthanoid Series Elements/chemistry ; Surface Plasmon Resonance ; *Nanoparticles/chemistry ; *Quantum Dots/chemistry ; }, abstract = {Deep tissue penetration, chemical inertness and biocompatibility give UCNPs a competitive edge over traditional fluorescent materials like organic dyes or quantum dots. However, the low quantum efficiency of UCNPs becomes an obstacle. Among the many methods and strategies currently used to address this issue, surface plasmon resonance (SPR) of noble metals is of great use due to the agreement between the SPR peak of metals and the absorption band of UCNPs. A key challenge of this match is that the structures and sizes of noble metals have significant influences on the peaks of SPR formants, so an explicit elucidation of the relationships between the physical properties of noble metals and their SPR formants is of great importance. This review aims to clarify the mechanism of the SPR effect of noble metals on the optical performance of UCNPs. Furthermore, novel research studies in which Au, Ag or Au/Ag composites in various structures and sizes are combined with UCNPs through different synthetic methods are summarized. We provide an overview of improved photoluminescence for bioimaging exhibited by different composite nanoparticles with respect to UCNPs acting as both cores and shells, taking Au@UCNPs, Ag@UCNPs and Au/Ag@UCNPs into account. Finally, there are remaining shortcomings and latent opportunities that deserve further research. This review will provide directions for the bioimaging applications of UCNPs through the introduction of the SPR effect of noble metals.}, } @article {pmid36460491, year = {2024}, author = {Wang, Y and Hattori, M and Liu, R and Sumita, YI}, title = {Digital acoustic analysis of the first three formant frequencies in patients with a prosthesis after maxillectomy.}, journal = {The Journal of prosthetic dentistry}, volume = {132}, number = {5}, pages = {1082-1087}, doi = {10.1016/j.prosdent.2022.10.010}, pmid = {36460491}, issn = {1097-6841}, mesh = {Humans ; Male ; Middle Aged ; Aged ; Adult ; Aged, 80 and over ; *Maxilla/surgery ; *Palatal Obturators ; *Speech Acoustics ; Young Adult ; Phonetics ; Speech Intelligibility ; }, abstract = {STATEMENT OF PROBLEM: Prosthetic rehabilitation with an obturator can help to restore or improve the intelligibility of speech in patients after maxillectomy.
The frequencies of formants 1 and 2, as well as their ranges, were initially reported in patients with maxillary defects in 2002, and the evaluation method that was used is now applied in clinical evaluation. However, the details of formant 3 are not known and warrant investigation because, according to speech science, formant 3 is related to the pharyngeal volume. Clarifying the formant frequency values of formant 3 in patients after maxillectomy would enable prosthodontists to refer to these data when planning treatment and when assessing the outcome of an obturator.

PURPOSE: The purpose of this clinical study was to determine the acoustic characteristics of formant 3, together with those of formants 1 and 2, by using a digital acoustic analysis during maxillofacial prosthetic treatment. The utility of determining formant 3 in the evaluation of speech in patients after maxillectomy was also assessed.

MATERIAL AND METHODS: Twenty-six male participants after a maxillectomy (mean age, 63 years; range, 20 to 93 years) were included, and the 5 Japanese vowels /a/, /e/, /i/, /o/, and /u/ produced with and without a definitive obturator prosthesis were recorded. The frequencies of the 3 formants were determined, and their ranges were calculated by using a speech analysis system (Computerized Speech Lab CSL 4400). The Wilcoxon signed rank test was used to compare the formants between the 2 use conditions (α=0.05).

RESULTS: Significant differences were found in the frequencies and ranges of all 3 formants between the use conditions. The ranges of all 3 formants produced with the prosthesis were significantly greater than those produced without it.
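The comparison reported here rests on the Wilcoxon signed-rank test over paired with/without-prosthesis measurements. A minimal sketch with scipy on simulated paired formant ranges (the means and spreads are invented, not the study's data):

import numpy as np
from scipy.stats import wilcoxon

# Hypothetical paired F3 ranges (Hz) across the five vowels for each
# participant, measured with and without the obturator prosthesis.
rng = np.random.default_rng(7)
without = rng.normal(400, 80, size=26)
with_prosthesis = without + rng.normal(120, 50, size=26)  # wider ranges expected

stat, p = wilcoxon(with_prosthesis, without)   # paired, non-parametric
print(f"W = {stat:.1f}, p = {p:.4f}")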

CONCLUSIONS: Based on the findings, both the first 2 formants and the third formant were changed by wearing an obturator prosthesis. Because formant 3 is related to the volume of the pharynx, evaluation of this formant and its range can reflect the effectiveness of the prosthesis in sealing the oronasal communication and helping reduce hypernasality, suggesting the utility of formant 3 analysis in prosthodontic rehabilitation.}, } @article {pmid36456282, year = {2022}, author = {Voeten, CC and Heeringa, W and Van de Velde, H}, title = {Normalization of nonlinearly time-dynamic vowels.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {5}, pages = {2692}, doi = {10.1121/10.0015025}, pmid = {36456282}, issn = {1520-8524}, abstract = {This study compares 16 vowel-normalization methods for purposes of sociophonetic research. Most of the previous work in this domain has focused on the performance of normalization methods on steady-state vowels. By contrast, this study explicitly considers dynamic formant trajectories, using generalized additive models to model these nonlinearly. Normalization methods were compared using a hand-corrected dataset from the Flemish-Dutch Teacher Corpus, which contains 160 speakers from 8 geographical regions, who spoke regionally accented versions of Netherlandic/Flemish Standard Dutch. Normalization performance was assessed by comparing the methods' abilities to remove anatomical variation, retain vowel distinctions, and explain variation in the normalized F0-F3. In addition, it was established whether normalization competes with by-speaker random effects or supplements it, by comparing how much between-speaker variance remained to be apportioned to random effects after normalization. The results partly reproduce the good performance of Lobanov, Gerstman, and Nearey 1 found earlier and generally favor log-mean and centroid methods. However, newer methods achieve higher effect sizes (i.e., explain more variance) at only marginally worse performances. Random effects were found to be equally useful before and after normalization, showing that they complement it. The findings are interpreted in light of the way that the different methods handle formant dynamics.}, } @article {pmid36455242, year = {2023}, author = {Leyns, C and Daelman, J and Adriaansen, A and Tomassen, P and Morsomme, D and T'Sjoen, G and D'haeseleer, E}, title = {Short-Term Acoustic Effects of Speech Therapy in Transgender Women: A Randomized Controlled Trial.}, journal = {American journal of speech-language pathology}, volume = {32}, number = {1}, pages = {145-168}, doi = {10.1044/2022_AJSLP-22-00135}, pmid = {36455242}, issn = {1558-9110}, mesh = {Humans ; Female ; *Speech Therapy ; Speech Acoustics ; *Transgender Persons ; Acoustics ; Speech ; }, abstract = {PURPOSE: This study measured and compared the short-term acoustic effects of pitch elevation training (PET), articulation-resonance training (ART), and the combination of both programs in transgender women.

METHOD: A randomized controlled study with cross-over design was used. Thirty transgender women were included and received 14 weeks of speech training. All participants started with 4 weeks of sham training, after which they were randomly assigned to one of two groups: one group continued with PET (5 weeks), followed by ART (5 weeks); the second group received both trainings in the opposite order. Participants were recorded 4 times, in between the training blocks: pre, post 1 (after sham), post 2 (after training 1), and post 3 (after training 2). Speech samples included a sustained vowel, continuous speech during reading, and spontaneous speech and were analyzed using Praat software. Fundamental frequency (fo), intensity, voice range profile, vowel formant frequencies (F1-F2-F3-F4-F5 of /a/-/i/-/u/), formant contrasts, vowel space, and vocal quality (Acoustic Voice Quality Index) were determined.

RESULTS AND CONCLUSIONS: Fundamental frequencies increased after both the PET and ART programs, with a larger increase after PET. The combination of both interventions showed a mean increase in fo of 49 Hz during a sustained vowel, 49 Hz during reading, and 29 Hz during spontaneous speech. However, the lower limit (5th percentile) of fo during spontaneous speech did not change. Higher values were detected for F1-F2 of /a/, F3 of /u/, and vowel space after PET and ART separately. F1-F2-F3 of /a/, F1-F3-F4 of /u/, vowel space, and formant contrasts increased after the combination of PET and ART; hence, the combination induced larger increases in formant frequencies. Intensity and voice quality measurements did not change. No order effect was detected; that is, starting with PET or ART did not change the outcome.}, } @article {pmid36425833, year = {2022}, author = {Chen, S and Han, C and Wang, S and Liu, X and Wang, B and Wei, R and Lei, X}, title = {Hearing the physical condition: The relationship between sexually dimorphic vocal traits and underlying physiology.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {983688}, pmid = {36425833}, issn = {1664-1078}, abstract = {A growing amount of research has shown associations between sexually dimorphic vocal traits and physiological conditions related to reproductive advantage. This paper presents a review of the literature on the relationship between sexually dimorphic vocal traits and sex hormones, body size, and physique. Those physiological conditions are important in reproductive success and mate selection. Regarding sex hormones, there are associations between sex-specific hormones and sexually dimorphic vocal traits; regarding body size, formant frequencies are more reliable predictors of human body size than pitch/fundamental frequency; and regarding physique, there is a possible but still controversial association between the human voice and strength and combat power, while pitch is more often used as a signal of aggressive intent in conflict. Future research should consider demographic, cross-cultural, cognitive interaction, and emotional motivation influences, in order to more accurately assess the relationship between voice and physiology. Moreover, neurological studies were recommended to gain a deeper understanding of the evolutionary origins and adaptive functions of voice modulation.}, } @article {pmid36397662, year = {2022}, author = {Eichner, ACO and Donadon, C and Skarżyński, PH and Sanfins, MD}, title = {A Systematic Review of the Literature Between 2009 and 2019 to Identify and Evaluate Publications on the Effects of Age-Related Hearing Loss on Speech Processing.}, journal = {Medical science monitor : international medical journal of experimental and clinical research}, volume = {28}, number = {}, pages = {e938089}, pmid = {36397662}, issn = {1643-3750}, mesh = {Aged ; Animals ; Humans ; Speech ; *Speech Perception/physiology ; Acoustic Stimulation ; *Hearing Loss, Sensorineural ; *Cochlear Implants ; }, abstract = {Changes in central auditory processing due to aging in normal-hearing elderly patients, as well as age-related hearing loss, are often associated with difficulties in speech processing, especially in unfavorable acoustic environments.
Speech processing depends on the perception of temporal and spectral features, and for this reason it can be assessed by recording phase-locked neural activity synchronized to transient and periodic sound stimuli, known as frequency-following responses (FFRs). An electronic search of the PubMed and Web of Science databases was carried out in July 2019. Studies that evaluated the effects of age-related hearing loss on components of FFRs were included. Studies that were not in English, studies performed on animals, studies with cochlear implant users, literature reviews, letters to the editor, and case studies were excluded. Our search yielded 6 studies, each of which included 30 to 94 subjects aged between 18 and 80 years. Latency increases and significant amplitude reductions of the onset, offset, and slope V/A components of FFRs were observed. Latency and amplitude impairment of the fundamental frequency, first formant, and higher formants were related to peripheral sensorineural hearing loss in the elderly population. Conclusions: Temporal changes in FFR tracing were related to the aging process. Hearing loss also impacts the envelope fine structure, producing poorer speech comprehension in noisy environments. More research is needed to understand aspects related to hearing loss and the cognitive factors common in the elderly.}, } @article {pmid36376191, year = {2022}, author = {Raveendran, R and Yeshoda, K}, title = {Effects of Resonant Voice Therapy on Perceptual and Acoustic Source and Tract Parameters - A Preliminary Study on Indian Carnatic Classical Singers.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.09.023}, pmid = {36376191}, issn = {1873-4588}, abstract = {PURPOSE: The aim of the study was to examine the effects of resonant voice therapy (RVT) on the vocal resonance of trained Carnatic singers. The specific objectives were to evaluate the effects of RVT on auditory perceptual judgments and on acoustic source and tract parameters of phonation and sung voice samples obtained before and after training.

METHOD: Six vocally healthy trained Carnatic singers, three males and three females aged 18-25 years (M = 23; S.D. = 2.09), participated in the study. All participants were assigned to a 21-day resonant voice therapy (RVT) training program. The participants' pre- and post-training phonation and sung samples were subjected to auditory perceptual analysis and acoustic analysis.

RESULTS: The results revealed that the post-training auditory perceptual ratings of the phonation task showed a statistically significant difference from the pre-training scores (Z = 2.35; P = 0.019). For the singing task, in contrast, the post-training perceptual ratings were not significantly different from the pre-training perceptual rating scores (Z = 2.66; P = 0.08). A significant difference was observed between the pre- and post-training values for all the measured acoustic parameters of the phonation task. In the singing task, though the fundamental frequency and the third and fourth formant frequencies showed no significant difference between the pre- and post-training conditions (P > 0.05), the difference between the first formant frequency and the fundamental frequency showed a significant decrease (P = 0.028).

CONCLUSION: Resonant voice production led to high vocal economy, as evidenced by the improved source and filter acoustic parameters. These results suggest formant tuning through vocal tract modifications, probably an enlarged pharyngeal area, resulting in increased resonant voice quality in both the phonation and singing tasks.}, } @article {pmid36371478, year = {2022}, author = {Rocchesso, D and Andolina, S and Ilardo, G and Palumbo, SD and Galluzzo, Y and Randazzo, M}, title = {A perceptual sound space for auditory displays based on sung-vowel synthesis.}, journal = {Scientific reports}, volume = {12}, number = {1}, pages = {19370}, pmid = {36371478}, issn = {2045-2322}, support = {PON AIM (id: AIM1875400-1, CUP: B74I18000210006)//Ministero dell'Istruzione, dell'Università e della Ricerca/ ; }, mesh = {Humans ; Sound Spectrography ; *Singing ; Sound ; *Speech Perception ; }, abstract = {When designing displays for the human senses, perceptual spaces are of great importance to give intuitive access to physical attributes. Similar to how perceptual spaces based on hue, saturation, and lightness were constructed for visual color, research has explored perceptual spaces for sounds of a given timbral family based on timbre, brightness, and pitch. To promote an embodied approach to the design of auditory displays, we introduce the Vowel-Type-Pitch (VTP) space, a cylindrical sound space based on human sung vowels, whose timbres can be synthesized by the composition of acoustic formants and can be categorically labeled. Vowels are arranged along the circular dimension, while voice type and pitch of the vowel correspond to the remaining two axes of the cylindrical VTP space. The decoupling and perceptual effectiveness of the three dimensions of the VTP space are tested through a vowel labeling experiment, whose results are visualized as maps on circular slices of the VTP cylinder. We discuss implications for the design of auditory and multi-sensory displays that account for human perceptual capabilities.}, } @article {pmid36360418, year = {2022}, author = {Yoon, TJ and Ha, S}, title = {Adults' Perception of Children's Vowel Production.}, journal = {Children (Basel, Switzerland)}, volume = {9}, number = {11}, pages = {}, pmid = {36360418}, issn = {2227-9067}, support = {NRF-2021S1A5A2A03064795//Ministry of Education of the Republic of Korea and the National Research Foundation of Korea/ ; }, abstract = {The study examined the link between Korean-speaking children's vowel production and its perception by inexperienced adults and also observed whether ongoing vowel changes in mid-back vowels affect adults' perceptions when the vowels are produced by children. This study analyzed vowels in monosyllabic words produced by 20 children, ranging from 2 to 6 years old, with a focus on gender distinction, and used them as perceptual stimuli for word perception by 20 inexperienced adult listeners. Acoustic analyses indicated that F0 was not a reliable cue for distinguishing gender, but the first two formants served as reliable cues for gender distinction. The results confirmed that the spacing of the two low formants is linguistically and para-linguistically important in identifying vowel types and gender. However, a pair of non-low back vowels caused difficulties in correct vowel identification.
The proximal distance between these vowels may explain the high rate of mismatch between children's production and adults' perception of the two non-low back vowels in Korean. We attribute this mismatch to the ongoing sound change observed in the high and mid back vowels of adult speech. The ongoing vowel change is also observed in the children's vowel space, which may well be shaped after that of caregivers whose non-low back vowels are close to each other.}, } @article {pmid36359019, year = {2022}, author = {Guo, S and Wu, W and Liu, Y and Kang, X and Li, C}, title = {Effects of Valley Topography on Acoustic Communication in Birds: Why Do Birds Avoid Deep Valleys in Daqinggou Nature Reserve?.}, journal = {Animals : an open access journal from MDPI}, volume = {12}, number = {21}, pages = {}, pmid = {36359019}, issn = {2076-2615}, support = {No. 2022xjkk0802//The Ministry of Science and Technology of China/ ; No. 2019HJ2096001006//The Ministry of Ecology and Environment of China/ ; }, abstract = {To investigate the effects of valley topography on the acoustic transmission of avian vocalisations, we carried out playback experiments in Daqinggou valley, Inner Mongolia, China. During the experiments, we recorded the vocalisations of five avian species, the large-billed crow (Corvus macrorhynchos Wagler, 1827), common cuckoo (Cuculus canorus Linnaeus, 1758), Eurasian magpie (Pica pica Linnaeus, 1758), Eurasian tree sparrow (Passer montanus Linnaeus, 1758), and meadow bunting (Emberiza cioides Brand, 1843), at transmission distances of 30 m and 50 m in the upper and lower parts of the valley and analysed the intensity, the fundamental frequency (F0), and the first three formant frequencies (F1/F2/F3) of the sounds. We also investigated bird species diversity in the upper and lower valley. We found that: (1) at the distance of 30 m, there were significant differences in F0/F1/F2/F3 in Eurasian magpies, significant differences in F1/F2/F3 in the meadow bunting and Eurasian tree sparrow, and partially significant differences in sound frequency between the upper and lower valley in the other two species; (2) at the distance of 50 m, there were significant differences in F0/F1/F2/F3 in two avian species (large-billed crow and common cuckoo) between the upper and lower valley and partially significant differences in sound frequency between the upper and lower valley in the other three species; (3) there were significant differences in the acoustic intensities of crow, cuckoo, magpie, and bunting calls between the upper and lower valley; and (4) species number and richness were significantly higher in the upper valley than in the lower valley. We suggest that the structure of valley habitats may lead to the breakdown of acoustic signals and communication in birds to varying degrees.
The effect of valley topography on acoustic communication could be one reason for animal species avoiding deep valleys.}, } @article {pmid36351244, year = {2022}, author = {Kim, Y and Thompson, A}, title = {An Acoustic-Phonetic Approach to Effects of Face Masks on Speech Intelligibility.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {12}, pages = {4679-4689}, pmid = {36351244}, issn = {1558-9102}, support = {F31 DC020121/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Female ; Humans ; *Speech Intelligibility ; Phonetics ; Speech Acoustics ; Acoustics ; *Speech Perception ; }, abstract = {PURPOSE: This study aimed to examine the effects of wearing a face mask on speech acoustics and intelligibility, using an acoustic-phonetic analysis of speech. In addition, the effects of speakers' behavioral modification while wearing a mask were examined.

METHOD: Fourteen female adults were asked to read a set of words and sentences under three conditions: (a) conversational, mask-off; (b) conversational, mask-on; and (c) clear, mask-on. Seventy listeners rated speech intelligibility using two methods: orthographic transcription and visual analog scale (VAS). Acoustic measures for vowels included duration, first (F1) and second (F2) formant frequency, and intensity ratio of F1/F2. For consonants, spectral moment coefficients and consonant-vowel (CV) boundary (intensity ratio between consonant and vowel) were measured.
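Spectral moment coefficients, as measured for the consonants in this study, summarize a spectrum by its first four moments. One common way to compute them from a windowed segment is sketched below; the noise segment stands in for a fricative, and the window and normalization choices are assumptions rather than the authors' exact settings.

import numpy as np

def spectral_moments(segment, sr):
    """First four spectral moments of a consonant segment:
    center of gravity, standard deviation, skewness, kurtosis (excess)."""
    spec = np.abs(np.fft.rfft(segment * np.hanning(len(segment))))
    freqs = np.fft.rfftfreq(len(segment), d=1.0 / sr)
    p = spec ** 2
    p = p / p.sum()                    # normalize power to a distribution
    m1 = np.sum(freqs * p)             # center of gravity (Hz)
    sd = np.sqrt(np.sum((freqs - m1) ** 2 * p))
    skew = np.sum((freqs - m1) ** 3 * p) / sd ** 3
    kurt = np.sum((freqs - m1) ** 4 * p) / sd ** 4 - 3.0
    return m1, sd, skew, kurt

# Hypothetical 25-ms fricative-like segment at 22.05 kHz.
sr = 22050
segment = np.random.default_rng(0).normal(size=int(0.025 * sr))
print(spectral_moments(segment, sr))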

RESULTS: Face masks had a negative impact on speech intelligibility as measured by both intelligibility methods. However, speech intelligibility was recovered in the clear speech condition for VAS but not for transcription scores. Analysis of orthographic transcription showed that listeners tended to frequently confuse consonants (particularly fricatives, affricates, and stops), rather than vowels, in the word-initial position. Acoustic data indicated a significant effect of condition on the CV intensity ratio only.

CONCLUSIONS: Our data demonstrate a negative effect of face masks on speech intelligibility, mainly affecting consonants. However, intelligibility can be enhanced by speaking clearly, likely driven by prosodic alterations.}, } @article {pmid36322641, year = {2024}, author = {Baker, CP and Sundberg, J and Purdy, SC and Rakena, TO}, title = {Female adolescent singing voice characteristics: an exploratory study using LTAS and inverse filtering.}, journal = {Logopedics, phoniatrics, vocology}, volume = {49}, number = {2}, pages = {83-95}, doi = {10.1080/14015439.2022.2140455}, pmid = {36322641}, issn = {1651-2022}, mesh = {Humans ; Adolescent ; Female ; *Singing ; *Voice Quality ; Young Adult ; *Acoustics ; *Signal Processing, Computer-Assisted ; Sound Spectrography ; Age Factors ; Time Factors ; Phonation ; Adolescent Development ; Adolescent Behavior ; Sex Factors ; }, abstract = {Background and Aim: To date, little research is available that objectively quantifies female adolescent singing-voice characteristics in light of the physiological and functional developments that occur from puberty to adulthood. This exploratory study sought to augment the pool of data available that offers objective voice analysis of female singers in late adolescence. Methods: Using long-term average spectra (LTAS) and inverse filtering techniques, dynamic range and voice-source characteristics were determined in a cohort of vocally healthy cis-gender female adolescent singers (17 to 19 years) from high-school choirs in Aotearoa New Zealand. Non-parametric statistics were used to determine associations and significant differences. Results: Wide intersubject variation was seen across dynamic range, spectral measures of harmonic organisation (formant cluster prominence, FCP), noise components in the spectrum (high-frequency energy ratio, HFER), and the normalised amplitude quotient (NAQ), suggesting great variability in the ability to control phonatory mechanisms such as subglottal pressure (Psub), glottal configuration and adduction, and vocal tract shaping. A strong association between the HFER and NAQ suggests that these non-invasive measures may offer complementary insights into vocal function, specifically with regard to glottal adduction and turbulent noise in the voice signal. Conclusion: Knowledge of the range of variation within healthy adolescent singers is necessary for the development of effective and inclusive pedagogical practices, and for vocal-health professionals working with singers of this age. LTAS and inverse filtering are useful non-invasive tools for determining such characteristics.}, } @article {pmid36313043, year = {2022}, author = {Easwar, V and Purcell, D and Eeckhoutte, MV and Aiken, SJ}, title = {The Influence of Male- and Female-Spoken Vowel Acoustics on Envelope-Following Responses.}, journal = {Seminars in hearing}, volume = {43}, number = {3}, pages = {223-239}, pmid = {36313043}, issn = {0734-0451}, abstract = {The influence of male and female vowel characteristics on the envelope-following responses (EFRs) is not well understood. This study explored the role of vowel characteristics on the EFR at the fundamental frequency (f0) in response to the vowel /ε/ (as in "head"). Vowel tokens were spoken by five males and five females and EFRs were measured in 25 young adults (21 females). An auditory model was used to estimate changes in auditory processing that might account for talker effects on EFR amplitude.
There were several differences between male and female vowels in relation to the EFR. For male talkers, EFR amplitudes were correlated with the bandwidth and harmonic count of the first formant, and the amplitude of the trough below the second formant. For female talkers, EFR amplitudes were correlated with the range of f0 frequencies and the amplitude of the trough above the second formant. The model suggested that the f0 EFR reflects a wide distribution of energy in speech, with primary contributions from high-frequency harmonics mediated from cochlear regions basal to the peaks of the first and second formants, not from low-frequency harmonics with energy near f0. Vowels produced by female talkers tend to produce lower-amplitude EFR, likely because they depend on higher-frequency harmonics where speech sound levels tend to be lower. This work advances auditory electrophysiology by showing how the EFR evoked by speech relates to the acoustics of speech, for both male and female voices.}, } @article {pmid36304844, year = {2022}, author = {Pah, ND and Indrawati, V and Kumar, DK}, title = {Voice Features of Sustained Phoneme as COVID-19 Biomarker.}, journal = {IEEE journal of translational engineering in health and medicine}, volume = {10}, number = {}, pages = {4901309}, pmid = {36304844}, issn = {2168-2372}, mesh = {Humans ; *COVID-19 ; Cross-Sectional Studies ; Longitudinal Studies ; Pandemics ; SARS-CoV-2 ; Biomarkers ; }, abstract = {BACKGROUND: The COVID-19 pandemic has resulted in enormous costs to our society. Besides finding medicines to treat those infected by the virus, it is important to find effective and efficient strategies to prevent the spreading of the disease. One key factor to prevent transmission is to identify COVID-19 biomarkers that can be used to develop an efficient, accurate, noninvasive, and self-administered screening procedure. Several COVID-19 variants cause significant respiratory symptoms, and thus a voice signal may be a potential biomarker for COVID-19 infection.

AIM: This study investigated the effectiveness of different phonemes and a range of voice features in differentiating people infected with COVID-19 who had respiratory tract symptoms from healthy controls.

METHOD: This cross-sectional, longitudinal study recorded six sustained phonemes (i.e., /a/, /e/, /i/, /o/, /u/, and /m/) from 40 COVID-19 patients and 48 healthy subjects over 22 days. Signal features were extracted from the recordings, statistically analyzed, and classified using a support vector machine (SVM).

RESULTS: The statistical analysis and SVM classification show that the voice features related to vocal tract filtering (e.g., MFCC, VTL, and formants) and to the stability of the respiratory muscles and lung volume (Intensity-SD) were the most sensitive to voice change due to COVID-19. The results also show that the features extracted from the vowel /i/ during the first 3 days after admission to the hospital were the most effective. The SVM classification accuracy with 18 ranked features extracted from /i/ was 93.5% (with an F1 score of 94.3%).
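As an illustration of the classification step described in these results (a sketch, not the authors' pipeline; the feature matrix and group separation below are synthetic), an SVM over per-recording acoustic features can be set up as follows:

```python
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

rng = np.random.default_rng(0)

# Hypothetical feature matrix: one row per recording of /i/, one column per
# ranked feature (e.g., MFCC means, formant frequencies, intensity SD).
n_patients, n_controls, n_features = 40, 48, 18
X = np.vstack([
    rng.normal(0.0, 1.0, (n_controls, n_features)),  # healthy subjects
    rng.normal(0.6, 1.0, (n_patients, n_features)),  # patients, shifted for the demo
])
y = np.array([0] * n_controls + [1] * n_patients)    # 0 = healthy, 1 = COVID-19

# Standardize features, fit an RBF-kernel SVM, and report cross-validated accuracy.
clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=1.0))
scores = cross_val_score(clf, X, y, cv=5)
print(f"5-fold CV accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")
```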

CONCLUSION: A measurable difference exists between the voices of people with COVID-19 and healthy people, and the phoneme /i/ shows the most pronounced difference. This supports the potential of computerized voice analysis to detect the disease and the use of voice as a COVID-19 biomarker.}, } @article {pmid36293884, year = {2022}, author = {Choi, MK and Yoo, SD and Park, EJ}, title = {Destruction of Vowel Space Area in Patients with Dysphagia after Stroke.}, journal = {International journal of environmental research and public health}, volume = {19}, number = {20}, pages = {}, pmid = {36293884}, issn = {1660-4601}, mesh = {Humans ; Dysarthria/complications ; *Deglutition Disorders/etiology ; Speech Acoustics ; Deglutition ; *Stroke/complications ; }, abstract = {Dysphagia is associated with dysarthria in stroke patients. Vowel space decreases in stroke patients with dysarthria; destruction of the vowel space is often observed. We determined the correlation of destruction of acoustic vowel space with dysphagia in stroke patients. Seventy-four individuals with dysphagia and dysarthria who had experienced stroke were enrolled. For the vowels /a/, /ae/, /i/, and /u/, we determined the formant parameters (which represent vocal tract resonance frequencies as two-dimensional coordinate points), the formant centralization ratio (FCR), and the quadrilateral vowel space area (VSA). Swallowing function was assessed using the videofluoroscopic dysphagia scale (VDS) during videofluoroscopic swallowing studies. Pearson's correlation and linear regression were used to determine the correlation between VSA, FCR, and VDS. Subgroups were created based on VSA; vowel space destruction groups were compared using ANOVA and Scheffe's test. VSA and FCR were negatively and positively correlated with VDS, respectively. Groups were separated based on the mean and standard deviation of VSA. One-way ANOVA revealed significant differences in VDS, FCR, and age between the VSA groups, and no significant differences in VDS between the mild and moderate VSA reduction groups and the vowel space destruction group. VSA and FCR values correlated with swallowing function. Vowel space destruction has characteristics similar to moderate-to-severe VSA reduction and has utility as an indicator of dysphagia severity.}, } @article {pmid36289365, year = {2022}, author = {Müller, M and Wang, Z and Caffier, F and Caffier, PP}, title = {New objective timbre parameters for classification of voice type and fach in professional opera singers.}, journal = {Scientific reports}, volume = {12}, number = {1}, pages = {17921}, pmid = {36289365}, issn = {2045-2322}, mesh = {Humans ; *Singing ; Voice Quality ; *Voice ; Occupations ; Sound ; }, abstract = {Voice timbre is defined as sound color independent of pitch and volume, based on a broad frequency band between 2 and 4 kHz. Since there are no specific timbre parameters, previous studies have come to the very general conclusion that the center frequencies of the singer's formants are somewhat higher in the higher voice types than in the lower ones. For specification, a database was created containing 1723 sound examples of various voice types. The energy distribution in the frequency bands of the singer's formants was extracted for quantitative analysis. When the energy distribution function reached 50%, the corresponding absolute frequency in Hz was defined as Frequency of Half Energy (FHE).
This new parameter quantifies the timbre of a singing voice as a concrete measure, independent of fundamental frequency, vowel color and volume. The database allows assigning FHE means ± SD as characteristic or comparative values for sopranos (3092 ± 284 Hz), tenors (2705 ± 221 Hz), baritones (2454 ± 206 Hz) and basses (2384 ± 164 Hz). In addition to vibrato, specific timbre parameters provide another valuable feature in vocal pedagogy for classification of voice type and fach according to the lyric or dramatic character of the voice.}, } @article {pmid36279585, year = {2022}, author = {Hussain, RO and Kumar, P and Singh, NK}, title = {Subcortical and Cortical Electrophysiological Measures in Children With Speech-in-Noise Deficits Associated With Auditory Processing Disorders.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {11}, pages = {4454-4468}, doi = {10.1044/2022_JSLHR-22-00094}, pmid = {36279585}, issn = {1558-9102}, mesh = {Child ; Humans ; Adolescent ; *Auditory Perceptual Disorders/diagnosis ; Speech ; Noise ; *Speech Perception/physiology ; Evoked Potentials, Auditory ; Acoustic Stimulation ; Evoked Potentials, Auditory, Brain Stem/physiology ; }, abstract = {PURPOSE: The aim of this study was to analyze the subcortical and cortical auditory evoked potentials for speech stimuli in children with speech-in-noise (SIN) deficits associated with auditory processing disorder (APD) without any reading or language deficits.

METHOD: The study included 20 children in the age range of 9-13 years. Ten children were recruited to the APD group; they had below-normal scores on the speech-perception-in-noise test and were diagnosed as having APD. The remaining 10 were typically developing (TD) children and were recruited to the TD group. Speech-evoked subcortical (brainstem) and cortical (auditory late latency) responses were recorded and compared across both groups.

RESULTS: The results showed a statistically significant reduction in the amplitudes of the subcortical potentials (both for stimulus in quiet and in noise) and the magnitudes of the spectral components (fundamental frequency and the second formant) in children with SIN deficits in the APD group compared to the TD group. In addition, the APD group displayed enhanced amplitudes of the cortical potentials compared to the TD group.

CONCLUSION: Children with SIN deficits associated with APD exhibited impaired coding/processing of the auditory information at the level of the brainstem and the auditory cortex.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.21357735.}, } @article {pmid36279201, year = {2022}, author = {Bochner, J and Samar, V and Prud'hommeaux, E and Huenerfauth, M}, title = {Phoneme Categorization in Prelingually Deaf Adult Cochlear Implant Users.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {11}, pages = {4429-4453}, doi = {10.1044/2022_JSLHR-22-00038}, pmid = {36279201}, issn = {1558-9102}, mesh = {Adult ; Humans ; Young Adult ; *Cochlear Implants ; *Deafness/rehabilitation ; *Speech Perception ; *Cochlear Implantation ; Hearing ; }, abstract = {PURPOSE: Phoneme categorization (PC) for voice onset time and second formant transition was studied in adult cochlear implant (CI) users with early-onset deafness and hearing controls.

METHOD: Identification and discrimination tasks were administered to 30 participants implanted before 4 years of age, 21 participants implanted after 7 years of age, and 21 hearing individuals.

RESULTS: Distinctive identification and discrimination functions confirmed PC within all groups. Compared to hearing participants, the CI groups generally displayed longer/higher category boundaries, shallower identification function slopes, reduced identification consistency, and reduced discrimination performance. A principal component analysis revealed that identification consistency, discrimination accuracy, and identification function slope, but not boundary location, loaded on a single factor, reflecting general PC performance. Earlier implantation was associated with better PC performance within the early CI group, but not the late CI group. Within the early CI group, earlier implantation age but not PC performance was associated with better speech recognition. Conversely, within the late CI group, better PC performance but not earlier implantation age was associated with better speech recognition.

CONCLUSIONS: Results suggest that implantation timing within the sensitive period before 4 years of age partly determines the level of PC performance. They also suggest that early implantation may promote development of higher level processes that can compensate for relatively poor PC performance, as can occur in challenging listening conditions.}, } @article {pmid36266347, year = {2022}, author = {Skrabal, D and Rusz, J and Novotny, M and Sonka, K and Ruzicka, E and Dusek, P and Tykalova, T}, title = {Articulatory undershoot of vowels in isolated REM sleep behavior disorder and early Parkinson's disease.}, journal = {NPJ Parkinson's disease}, volume = {8}, number = {1}, pages = {137}, pmid = {36266347}, issn = {2373-8057}, support = {NU20-08-00445//Ministerstvo Zdravotnictví Ceské Republiky (Ministry of Health of the Czech Republic)/ ; MH CZ-DRO-VFN64165//Ministerstvo Zdravotnictví Ceské Republiky (Ministry of Health of the Czech Republic)/ ; NU20-08-00445//Ministerstvo Zdravotnictví Ceské Republiky (Ministry of Health of the Czech Republic)/ ; NU20-08-00445//Ministerstvo Zdravotnictví Ceské Republiky (Ministry of Health of the Czech Republic)/ ; NU20-08-00445//Ministerstvo Zdravotnictví Ceské Republiky (Ministry of Health of the Czech Republic)/ ; }, abstract = {Imprecise vowels represent a common deficit associated with hypokinetic dysarthria resulting from a reduced articulatory range of motion in Parkinson's disease (PD). It is not yet known whether the vowel articulation impairment is already evident in the prodromal stages of synucleinopathy. We aimed to assess whether vowel articulation abnormalities are present in isolated rapid eye movement sleep behaviour disorder (iRBD) and early-stage PD. A total of 180 male participants, including 60 with iRBD, 60 with de-novo PD, and 60 age-matched healthy controls, performed reading of a standardized passage. The first and second formant frequencies of the corner vowels /a/, /i/, and /u/, extracted from predefined words, were utilized to construct the articulatory-acoustic measures of Vowel Space Area (VSA) and Vowel Articulation Index (VAI). Compared to controls, VSA was smaller in both iRBD (p = 0.01) and PD (p = 0.001), while VAI was lower only in PD (p = 0.002). The iRBD subgroup with abnormal olfactory function had smaller VSA compared to the iRBD subgroup with preserved olfactory function (p = 0.02). In PD patients, the extent of bradykinesia and rigidity correlated with VSA (r = -0.33, p = 0.01), while no correlation between axial gait symptoms or tremor and vowel articulation was detected. Vowel articulation impairment represents an early prodromal symptom in the disease process of synucleinopathy. Acoustic assessment of vowel articulation may provide a surrogate marker of synucleinopathy in scenarios where a single robust feature to monitor dysarthria progression is needed.}, } @article {pmid36266224, year = {2022}, author = {Zhang, T and He, M and Li, B and Zhang, C and Hu, J}, title = {Acoustic Characteristics of Cantonese Speech Through Protective Facial Coverings.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.08.029}, pmid = {36266224}, issn = {1873-4588}, abstract = {OBJECTIVES: Protective facial coverings (PFCs) such as surgical masks attenuate speech transmission and affect speech intelligibility, as has been reported in languages such as English and German.
The present study aimed to verify these detrimental impacts on the production of tonal languages such as Cantonese by examining the realization of acoustic correlates of Cantonese speech under PFCs, including face masks and face shields.

METHODS: We recorded scripted speech in Hong Kong Cantonese produced by three adult speakers who wore various PFCs, including surgical masks, KF94 masks, and face shields (with and without surgical masks). Spectral and temporal parameters were measured, including mean intensity, speaking rate, long-term amplitude spectrum, formant frequencies of vowels, and duration and fundamental frequency (F0) of tone-bearing parts.

RESULTS: Significant changes were observed in all acoustic correlates of Cantonese speech under PFCs. Sound pressure levels were attenuated more strongly at higher frequencies in speech through face masks, whereas sound transmission was affected more at lower frequencies in speech under face shields. Vowel spaces derived from formant frequencies shrank under all PFCs, with the vowel /aa/ demonstrating the largest changes in the first two formants. All tone-bearing parts were shortened and showed increases in mean F0 in speech through PFCs. The decrease in tone duration was statistically significant for the High-level and Low-level tones, while the increase in mean F0 was significant for the High-level tone only.
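Several entries in this section rely on a vowel space area of this kind. It is conventionally computed as the area of the polygon spanned by the corner vowels in (F2, F1) space; a minimal sketch with invented formant values:

```python
def vowel_space_area(points):
    """Polygon (shoelace) area for corner vowels given as (F2, F1) pairs in Hz,
    listed in polygon order."""
    area = 0.0
    n = len(points)
    for i in range(n):
        x1, y1 = points[i]
        x2, y2 = points[(i + 1) % n]
        area += x1 * y2 - x2 * y1
    return abs(area) / 2.0

# Hypothetical mean formants (Hz) for three corner vowels, with and without a
# mask; the shrinkage below is invented for illustration.
unmasked = [(2200.0, 300.0), (1300.0, 900.0), (800.0, 350.0)]
masked = [(2050.0, 330.0), (1280.0, 820.0), (850.0, 360.0)]
print(vowel_space_area(unmasked), vowel_space_area(masked))  # areas in Hz^2
```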

CONCLUSIONS: A general filtering effect of PFCs is observed in Cantonese speech data, confirming language-universal patterns in acoustic attenuation by PFCs. The various coverings lower the overall intensity levels of speech and degrade the speech signal in higher frequency regions. Modification patterns specific to Hong Kong Cantonese are also identified. Vowel space area was reduced and was found to be associated with increased speaking rates. Tones were produced with higher F0s under PFCs, which may be attributed to vocal tension caused by a tightened vocal tract during speaking through facial coverings.}, } @article {pmid36215575, year = {2022}, author = {Urzúa, AR and Wolf, KB}, title = {Unitary rotation of pixellated polychromatic images.}, journal = {Journal of the Optical Society of America. A, Optics, image science, and vision}, volume = {39}, number = {8}, pages = {1323-1329}, doi = {10.1364/JOSAA.462530}, pmid = {36215575}, issn = {1520-8532}, abstract = {Unitary rotations of polychromatic images on finite two-dimensional pixellated screens provide invertibility, group composition, and thus conservation of information. Rotations have been applied on monochromatic image data sets, where we now examine more closely the Gibbs-like oscillations that appear due to discrete "discontinuities" of the input images under unitary transformations. Extended to three-color images, we examine here the display of color at the pixels where, due to oscillations, some pixel color values may fall outside their required common numerical range [0,1], between absence and saturation of the red, green, and blue formant colors we choose to represent the images.}, } @article {pmid36182345, year = {2022}, author = {Rothenberg, M and Rothenberg, S}, title = {Measuring the distortion of speech by a facemask.}, journal = {JASA express letters}, volume = {2}, number = {9}, pages = {095203}, doi = {10.1121/10.0014002}, pmid = {36182345}, issn = {2691-1191}, mesh = {Acoustics ; Masks ; Mouth ; *Speech ; *Voice ; }, abstract = {Most prior research focuses on the reduced amplitude of speech caused by facemasks. This paper argues that the interaction between the acoustic properties of a facemask and the acoustic properties of the vocal tract contributes to speech distortion by changing the formants of the voice. The speech distortion caused by a number of masks was tested by measuring the increase in damping of the first formant. Results suggest that masks dampen the first formant and that increasing the distance between the mask wall and the mouth can reduce this distortion. These findings contribute to the research studying the impact of masks on speech.}, } @article {pmid36182341, year = {2022}, author = {Tran Ngoc, A and Meunier, F and Meyer, J}, title = {Testing perceptual flexibility in speech through the categorization of whistled Spanish consonants by French speakers.}, journal = {JASA express letters}, volume = {2}, number = {9}, pages = {095201}, doi = {10.1121/10.0013900}, pmid = {36182341}, issn = {2691-1191}, mesh = {Cues ; Humans ; Language ; Phonetics ; *Speech/physiology ; *Speech Perception/physiology ; }, abstract = {Whistled speech is a form of modified speech where, in non-tonal languages, vowels and consonants are augmented and transposed to whistled frequencies, simplifying their timbre. According to previous studies, these transformations maintain some level of vowel recognition for naive listeners.
Here, in a behavioral experiment, naive listeners' capacities for the categorization of four whistled consonants (/p/, /k/, /t/, and /s/) were analyzed. Results show patterns of correct responses and confusions that provide new insights into whistled speech perception, highlighting the importance of frequency modulation cues, transposed from phoneme formants, as well as the perceptual flexibility in processing these cues.}, } @article {pmid36182291, year = {2022}, author = {Winn, MB and Wright, RA}, title = {Reconsidering commonly used stimuli in speech perception experiments.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {3}, pages = {1394}, doi = {10.1121/10.0013415}, pmid = {36182291}, issn = {1520-8524}, mesh = {Language ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception/physiology ; *Voice ; }, abstract = {This paper examines some commonly used stimuli in speech perception experiments and raises questions about their use, or about the interpretations of previous results. The takeaway messages are: 1) the Hillenbrand vowels represent a particular dialect rather than a gold standard, and English vowels contain spectral dynamics that have been largely underappreciated, 2) the /ɑ/ context is very common but not clearly superior as a context for testing consonant perception, 3) /ɑ/ is particularly problematic when testing voice-onset-time perception because it introduces strong confounds in the formant transitions, 4) /dɑ/ is grossly overrepresented in neurophysiological studies and yet is insufficient as a generalized proxy for "speech perception," and 5) digit tests and matrix sentences including the coordinate response measure are systematically insensitive to important patterns in speech perception. Each of these stimulus sets and concepts is described with careful attention to their unique value and also cases where they might be misunderstood or over-interpreted.}, } @article {pmid36171463, year = {2022}, author = {Borodkin, K and Gassner, T and Ershaid, H and Amir, N}, title = {tDCS modulates speech perception and production in second language learners.}, journal = {Scientific reports}, volume = {12}, number = {1}, pages = {16212}, pmid = {36171463}, issn = {2045-2322}, mesh = {Acoustic Stimulation ; Adult ; Humans ; Language ; Phonetics ; Speech/physiology ; *Speech Perception/physiology ; *Transcranial Direct Current Stimulation ; }, abstract = {Accurate identification and pronunciation of nonnative speech sounds can be particularly challenging for adult language learners. The current study tested the effects of a brief musical training combined with transcranial direct current stimulation (tDCS) on speech perception and production in a second language (L2). The sample comprised 36 native Hebrew speakers, aged 18-38, who studied English as L2 in a formal setting and had little musical training. Training encompassed musical perception tasks with feedback (i.e., timbre, duration, and tonal memory) and concurrent tDCS applied over the left posterior auditory-related cortex (including posterior superior temporal gyrus and planum temporale). Participants were randomly assigned to anodal or sham stimulation. Musical perception, L2 speech perception (measured by a categorical AXB discrimination task) and speech production (measured by a speech imitation task) were tested before and after training. There were no tDCS-dependent effects on musical perception post-training. 
However, only participants who received active stimulation showed increased accuracy of L2 phoneme discrimination and greater change in the acoustic properties of L2 speech sound production (i.e., second formant frequency in vowels and center of gravity in consonants). The results of this study suggest neuromodulation can facilitate the processing of nonnative speech sounds in adult learners.}, } @article {pmid36154230, year = {2022}, author = {Morse, RP and Holmes, SD and Irving, R and McAlpine, D}, title = {Noise helps cochlear implant listeners to categorize vowels.}, journal = {JASA express letters}, volume = {2}, number = {4}, pages = {042001}, doi = {10.1121/10.0010071}, pmid = {36154230}, issn = {2691-1191}, mesh = {*Cochlear Implantation ; *Cochlear Implants ; Noise/adverse effects ; Phonetics ; *Speech Perception ; }, abstract = {Theoretical studies demonstrate that controlled addition of noise can enhance the amount of information transmitted by a cochlear implant (CI). The present study is a proof-of-principle for whether stochastic facilitation can improve the ability of CI users to categorize speech sounds. Analogue vowels were presented to CI users through a single electrode with independent noise on multiple electrodes. Noise improved vowel categorization, particularly in terms of an increase in information conveyed by the first and second formant. Noise, however, did not significantly improve vowel recognition: the miscategorizations were just more consistent, giving the potential to improve with experience.}, } @article {pmid36129844, year = {2022}, author = {Easwar, V and Purcell, D and Lasarev, M and McGrath, E and Galloy, M}, title = {Speech-Evoked Envelope Following Responses in Children and Adults.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {10}, pages = {4009-4023}, doi = {10.1044/2022_JSLHR-22-00156}, pmid = {36129844}, issn = {1558-9102}, mesh = {Acoustic Stimulation ; Adolescent ; Child ; Hearing Tests ; Humans ; Male ; Sensitivity and Specificity ; *Speech ; *Speech Perception/physiology ; Young Adult ; }, abstract = {PURPOSE: Envelope following responses (EFRs) could be useful for objectively evaluating audibility of speech in children who are unable to participate in routine clinical tests. However, relative to adults, the characteristics of EFRs elicited by frequency-specific speech and their utility in predicting audibility in children are unknown.

METHOD: EFRs were elicited by the first (F1) and second and higher formants (F2+) of male-spoken vowels /u/ and /i/ and by fricatives /ʃ/ and /s/ in the token /suʃi/ presented at 15, 35, 55, 65, and 75 dB SPL. The F1, F2+, and fricatives were low-, mid-, and high-frequency dominant, respectively. EFRs were recorded between the vertex and the nape from twenty-three 6- to 17-year-old children and 21 young adults with normal hearing. Sensation levels of stimuli were estimated based on behavioral thresholds.

RESULTS: In children, amplitude decreased with age for /ʃ/-elicited EFRs but remained stable for low- and mid-frequency stimuli. As a group, EFR amplitude and phase coherence did not differ from those of adults. EFR sensitivity (proportion of audible stimuli detected) and specificity (proportion of inaudible stimuli not detected) did not vary between children and adults. Consistent with previous work, EFR sensitivity increased with stimulus frequency and level. The type of statistical indicator used for EFR detection did not influence accuracy in children.
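The statistical indicators referred to here test whether response power at the stimulus envelope frequency exceeds the surrounding noise floor. A crude sketch of one such indicator, a bin-power-to-neighboring-bins ratio over a simulated response (not necessarily the exact indicators compared in the study):

```python
import numpy as np

def efr_power_ratio(response, fs, f0, n_neighbors=24):
    """Power in the FFT bin nearest the response frequency divided by the mean
    power of neighboring bins, which serve as a noise-floor estimate."""
    power = np.abs(np.fft.rfft(response)) ** 2
    freqs = np.fft.rfftfreq(len(response), d=1.0 / fs)
    k = int(np.argmin(np.abs(freqs - f0)))    # bin nearest the response frequency
    lo = max(k - n_neighbors // 2, 1)         # skip the DC bin
    hi = min(k + n_neighbors // 2 + 1, len(power))
    noise_bins = [i for i in range(lo, hi) if i != k]
    return power[k] / power[noise_bins].mean()

# Simulated averaged response: 1 s of noise plus a weak 100 Hz component.
fs, f0 = 5000, 100.0
t = np.arange(fs) / fs
rng = np.random.default_rng(1)
response = 0.02 * np.sin(2 * np.pi * f0 * t) + rng.normal(0.0, 0.1, fs)
print(f"Power ratio at {f0} Hz: {efr_power_ratio(response, fs, f0):.1f}")
```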

CONCLUSIONS: Adultlike EFRs in 6- to 17-year-old typically developing children suggest mature envelope encoding for low- and mid-frequency stimuli. EFR sensitivity and specificity in children, when considering a wide range of stimulus levels and audibility, are ~77% and ~92%, respectively.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.21136171.}, } @article {pmid36092651, year = {2022}, author = {Nault, DR and Mitsuya, T and Purcell, DW and Munhall, KG}, title = {Perturbing the consistency of auditory feedback in speech.}, journal = {Frontiers in human neuroscience}, volume = {16}, number = {}, pages = {905365}, pmid = {36092651}, issn = {1662-5161}, abstract = {Sensory information, including auditory feedback, is used by talkers to maintain fluent speech articulation. Current models of speech motor control posit that speakers continually adjust their motor commands based on discrepancies between the sensory predictions made by a forward model and the sensory consequences of their speech movements. Here, in two within-subject design experiments, we used a real-time formant manipulation system to explore how reliant speech articulation is on the accuracy or predictability of auditory feedback information. This involved introducing random formant perturbations during vowel production that varied systematically in their spatial location in formant space (Experiment 1) and temporal consistency (Experiment 2). Our results indicate that, on average, speakers' responses to auditory feedback manipulations varied based on the relevance and degree of the error that was introduced in the various feedback conditions. In Experiment 1, speakers' average production was not reliably influenced by random perturbations that were introduced every utterance to the first (F1) and second (F2) formants in various locations of formant space that had an overall average of 0 Hz. However, when perturbations were applied that had a mean of +100 Hz in F1 and -125 Hz in F2, speakers demonstrated reliable compensatory responses that reflected the average magnitude of the applied perturbations. In Experiment 2, speakers did not significantly compensate for perturbations of varying magnitudes that were held constant for one and three trials at a time. Speakers' average productions did, however, significantly deviate from a control condition when perturbations were held constant for six trials. Within the context of these conditions, our findings provide evidence that the control of speech movements is, at least in part, dependent upon the reliability and stability of the sensory information that it receives over time.}, } @article {pmid36063640, year = {2022}, author = {Frankford, SA and Cai, S and Nieto-Castañón, A and Guenther, FH}, title = {Auditory feedback control in adults who stutter during metronome-paced speech II. Formant Perturbation.}, journal = {Journal of fluency disorders}, volume = {74}, number = {}, pages = {105928}, pmid = {36063640}, issn = {1873-801X}, support = {R01 DC007683/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Humans ; *Stuttering/therapy ; Speech/physiology ; Feedback ; Feedback, Sensory/physiology ; Auditory Perception/physiology ; }, abstract = {PURPOSE: Prior work has shown that Adults who stutter (AWS) have reduced and delayed responses to auditory feedback perturbations. This study aimed to determine whether external timing cues, which increase fluency, resolve auditory feedback processing disruptions.

METHODS: Fifteen AWS and sixteen adults who do not stutter (ANS) read aloud a multisyllabic sentence either with natural stress and timing or with each syllable paced at the rate of a metronome. On random trials, an auditory feedback formant perturbation was applied, and formant responses were compared between groups and pacing conditions.

RESULTS: During normally paced speech, ANS showed a significant compensatory response to the perturbation by the end of the perturbed vowel, while AWS did not. In the metronome-paced condition, which significantly reduced the disfluency rate, the opposite was true: AWS showed a significant response by the end of the vowel, while ANS did not.

CONCLUSION: These findings indicate a potential link between the reduction in stuttering found during metronome-paced speech and changes in auditory motor integration in AWS.}, } @article {pmid36050247, year = {2022}, author = {Lee, SH and Lee, GS}, title = {Long-term Average Spectrum and Nasal Accelerometry in Sentences of Differing Nasality and Forward-Focused Vowel Productions Under Altered Auditory Feedback.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.07.026}, pmid = {36050247}, issn = {1873-4588}, abstract = {OBJECTIVES AND BACKGROUND: To investigate whether voice focus adjustments can alter the audio-vocal feedback and consequently modulate speech/voice motor control. Speaking with a forward-focused voice was expected to enhance audio-vocal feedback and thus decrease the variability of vocal fundamental frequency (F0).

MATERIALS AND METHOD: Twenty-two healthy, untrained adults (10 males and 12 females) were requested to sustain the vowel /a/ with their natural focus and a forward focus and to naturally read the nasal, oral, and mixed oral-nasal sentences in normal and noise-masked auditory conditions. Meanwhile, a miniature accelerometer was externally attached to the nose to detect the nasal vibrations during vocalization. Audio recordings were made and analyzed using the long-term average spectrum (LTAS) and power spectral analysis of F0.

RESULTS: Compared with naturally-focused vowel production and oral sentences, forward-focused vowel productions and nasal sentences both showed significant increases in nasal accelerometric amplitude and in the spectral power within the range of 200∼300 Hz, and significant decreases in the F0 variability below 3 Hz, which has been reported to be associated with enhanced auditory feedback in our previous research. The auditory masking not only significantly increased the low-frequency F0 variability, but also significantly decreased the ratio of the spectral power within 200∼300 Hz to the power within 300∼1000 Hz for the vowel and sentence productions. Gender differences were found in the correlations between the degree of nasal coupling and F0 stability as well as in the LTAS characteristics in response to noise.
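The 200∼300 Hz to 300∼1000 Hz power ratio reported here can be estimated from a recording with a standard Welch PSD; a minimal sketch, with a synthetic vowel-like signal standing in for real recordings:

```python
import numpy as np
from scipy.signal import welch

def band_power_ratio(x, fs, band_num=(200.0, 300.0), band_den=(300.0, 1000.0)):
    """Ratio of spectral power in two frequency bands, from a Welch PSD estimate."""
    freqs, psd = welch(x, fs=fs, nperseg=4096)
    def band_power(lo, hi):
        mask = (freqs >= lo) & (freqs < hi)
        return np.trapz(psd[mask], freqs[mask])
    return band_power(*band_num) / band_power(*band_den)

# Synthetic vowel-like signal: 250 Hz fundamental with weaker harmonics, so
# some energy falls inside the 200-300 Hz band and some above it.
fs = 16000
t = np.arange(2 * fs) / fs
x = sum((1.0 / k) * np.sin(2 * np.pi * 250.0 * k * t) for k in range(1, 6))
print(f"Power ratio (200-300 Hz vs 300-1000 Hz): {band_power_ratio(x, fs):.2f}")
```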

CONCLUSIONS: Variations in nasal-oral acoustic coupling not only change the formant features of speech signals, but involuntarily influence the auditory feedback control of vocal fold vibrations. Speakers tend to show improved F0 stability in response to a forward-focused voice adjustment.}, } @article {pmid36050180, year = {2022}, author = {Ibrahim, O and Yuen, I and van Os, M and Andreeva, B and Möbius, B}, title = {The combined effects of contextual predictability and noise on the acoustic realisation of German syllables.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {2}, pages = {911}, doi = {10.1121/10.0013413}, pmid = {36050180}, issn = {1520-8524}, mesh = {Acoustics ; Language ; *Noise/adverse effects ; Speech ; Speech Acoustics ; *Speech Perception ; }, abstract = {Speakers tend to speak clearly in noisy environments, while they tend to reserve effort by shortening word duration in predictable contexts. It is unclear how these two communicative demands are met. The current study investigates the acoustic realizations of syllables in predictable vs unpredictable contexts across different background noise levels. Thirty-eight German native speakers produced 60 CV syllables in two predictability contexts in three noise conditions (reference = quiet, 0 dB and -10 dB signal-to-noise ratio). Duration, intensity (average and range), F0 (median), and vowel formants of the target syllables were analysed. The presence of noise yielded significantly longer duration, higher average intensity, larger intensity range, and higher F0. Noise levels affected intensity (average and range) and F0. Low predictability syllables exhibited longer duration and larger intensity range. However, no interaction was found between noise and predictability. This suggests that noise-related modifications might be independent of predictability-related changes, with implications for including channel-based and message-based formulations in speech production.}, } @article {pmid36050169, year = {2022}, author = {Krumbiegel, J and Ufer, C and Blank, H}, title = {Influence of voice properties on vowel perception depends on speaker context.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {2}, pages = {820}, doi = {10.1121/10.0013363}, pmid = {36050169}, issn = {1520-8524}, mesh = {Humans ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {Different speakers produce the same intended vowel with very different physical properties. Fundamental frequency (F0) and formant frequencies (FF), the two main parameters that discriminate between voices, also influence vowel perception. While it has been shown that listeners comprehend speech more accurately if they are familiar with a talker's voice, it is still unclear how such prior information is used when decoding the speech stream. In three online experiments, we examined the influence of speaker context via F0 and FF shifts on the perception of /o/-/u/ vowel contrasts. Participants perceived vowels from an /o/-/u/ continuum shifted toward /u/ when F0 was lowered or FF increased relative to the original speaker's voice and vice versa. This shift was reduced when the speakers were presented in a block-wise context compared to random order. Conversely, the original base voice was perceived to be shifted toward /u/ when presented in the context of a low F0 or high FF speaker, compared to a shift toward /o/ with high F0 or low FF speaker context. 
These findings demonstrate that F0 and FF jointly influence vowel perception in speaker context.}, } @article {pmid36050157, year = {2022}, author = {Whalen, DH and Chen, WR and Shadle, CH and Fulop, SA}, title = {Formants are easy to measure; resonances, not so much: Lessons from Klatt (1986).}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {2}, pages = {933}, pmid = {36050157}, issn = {1520-8524}, support = {R01 DC002717/DC/NIDCD NIH HHS/United States ; }, mesh = {*Acoustics ; Algorithms ; Canada ; Humans ; Language ; *Speech Acoustics ; }, abstract = {Formants in speech signals are easily identified, largely because formants are defined to be local maxima in the wideband sound spectrum. Sadly, this is not what is of most interest in analyzing speech; instead, resonances of the vocal tract are of interest, and they are much harder to measure. Klatt [(1986). in Proceedings of the Montreal Satellite Symposium on Speech Recognition, 12th International Congress on Acoustics, edited by P. Mermelstein (Canadian Acoustical Society, Montreal), pp. 5-7] showed that estimates of resonances are biased by harmonics while the human ear is not. Several analysis techniques placed the formant closer to a strong harmonic than to the center of the resonance. This "harmonic attraction" can persist with newer algorithms and in hand measurements, and systematic errors can persist even in large corpora. Research has shown that the reassigned spectrogram is less subject to these errors than linear predictive coding and similar measures, but it has not been satisfactorily automated, making its wider use unrealistic. Pending better techniques, the recommendations are (1) acknowledge limitations of current analyses regarding influence of F0 and limits on granularity, (2) report settings more fully, (3) justify settings chosen, and (4) examine the pattern of F0 vs F1 for possible harmonic bias.}, } @article {pmid36009709, year = {2022}, author = {Beeck, VC and Heilmann, G and Kerscher, M and Stoeger, AS}, title = {Sound Visualization Demonstrates Velopharyngeal Coupling and Complex Spectral Variability in Asian Elephants.}, journal = {Animals : an open access journal from MDPI}, volume = {12}, number = {16}, pages = {}, pmid = {36009709}, issn = {2076-2615}, support = {W 1262/FWF_/Austrian Science Fund FWF/Austria ; P31034-B29//Austrian Science Fund (FWF)/ ; W1262-B29//Austrian Science Fund (FWF)/ ; //Marie Jahoda-Scholarship/ ; Final fellowship//VDS CoBeNe/ ; P 31034/FWF_/Austrian Science Fund FWF/Austria ; }, abstract = {Sound production mechanisms set the parameter space available for transmitting biologically relevant information in vocal signals. Low-frequency rumbles play a crucial role in coordinating social interactions in elephants' complex fission-fusion societies. By emitting rumbles through either the oral or the three-times longer nasal vocal tract, African elephants alter their spectral shape significantly. In this study, we used an acoustic camera to visualize the sound emission of rumbles in Asian elephants, which have received far less research attention than African elephants. We recorded nine adult captive females and analyzed the spectral parameters of 203 calls, including vocal tract resonances (formants). We found that the majority of rumbles (64%) were nasally emitted, 21% orally, and 13% simultaneously through the mouth and trunk, demonstrating velopharyngeal coupling. Some of the rumbles were combined with orally emitted roars.
The nasal rumbles concentrated most spectral energy in lower frequencies and exhibited two formants, whereas the oral and mixed rumbles contained higher formants and higher concentrations of spectral energy and were louder. The roars were the loudest, highest, and broadest in frequency. This study is the first to demonstrate velopharyngeal coupling in a non-human animal. Our findings provide a foundation for future research into the adaptive functions of elephant acoustic variability for information coding, localizability, or sound transmission, as well as vocal flexibility across species.}, } @article {pmid36007484, year = {2022}, author = {Rong, P and Hansen, O and Heidrick, L}, title = {Relationship between rate-elicited changes in muscular-kinematic control strategies and acoustic performance in individuals with ALS-A multimodal investigation.}, journal = {Journal of communication disorders}, volume = {99}, number = {}, pages = {106253}, doi = {10.1016/j.jcomdis.2022.106253}, pmid = {36007484}, issn = {1873-7994}, mesh = {Acoustics ; *Amyotrophic Lateral Sclerosis ; Biomechanical Phenomena/physiology ; Humans ; Speech/physiology ; Speech Acoustics ; Speech Intelligibility/physiology ; Speech Production Measurement ; Tongue ; }, abstract = {INTRODUCTION: As a key control variable, duration has long been suspected to mediate the organization of speech motor control strategies, which has management implications for neuromotor speech disorders. This study aimed to experimentally delineate the role of duration in organizing speech motor control in neurologically healthy and impaired speakers using a voluntary speaking rate manipulation paradigm.

METHODS: Thirteen individuals with amyotrophic lateral sclerosis (ALS) and 10 healthy controls performed a sentence reading task three times, first at their habitual rate, then at a slower rate. A multimodal approach combining surface electromyography, kinematic, and acoustic technologies was used to record jaw muscle activities, jaw kinematics, and speech acoustics. Six muscular-kinematic features were extracted and factor-analyzed to characterize the organization of the mandibular control hierarchy. Five acoustic features were extracted, measuring the spectrotemporal properties of the diphthong /ɑɪ/ and the plosives /t/ and /k/.

RESULTS: The muscular-kinematic features converged into two interpretable latent factors, reflecting the level and cohesiveness/flexibility of mandibular control, respectively. Voluntary rate reduction led to a trend toward (1) finer, less cohesive, and more flexible mandibular control, and (2) increased range and decreased transition slope of the diphthong formants, across neurologically healthy and impaired groups. Differential correlations were found between the rate-elicited changes in mandibular control and acoustic performance for neurologically healthy and impaired speakers.

CONCLUSIONS: The results provided empirical evidence for the long-suspected but previously unsubstantiated role of duration in (re)organizing speech motor control strategies. The rate-elicited reorganization of muscular-kinematic control contributed to the acoustic performance of healthy speakers, in ways consistent with theoretical predictions. Such contributions were less consistent in impaired speakers, implying the complex nature of speaking rate reduction in ALS, possibly reflecting an interplay of disease-related constraints and volitional duration control. This information may help to stratify and identify candidates for the rate manipulation therapy.}, } @article {pmid36002663, year = {2022}, author = {Easwar, V and Aiken, S and Beh, K and McGrath, E and Galloy, M and Scollie, S and Purcell, D}, title = {Variability in the Estimated Amplitude of Vowel-Evoked Envelope Following Responses Caused by Assumed Neurophysiologic Processing Delays.}, journal = {Journal of the Association for Research in Otolaryngology : JARO}, volume = {23}, number = {6}, pages = {759-769}, pmid = {36002663}, issn = {1438-7573}, support = {//CIHR/Canada ; }, mesh = {Young Adult ; Child ; Male ; Humans ; Adolescent ; Acoustic Stimulation/methods ; *Speech Perception/physiology ; Noise ; Electroencephalography/methods ; Reaction Time/physiology ; }, abstract = {Vowel-evoked envelope following responses (EFRs) reflect neural encoding of the fundamental frequency of voice (f0). Accurate analysis of EFRs elicited by natural vowels requires the use of methods like the Fourier analyzer (FA) to consider the production-related f0 changes. The FA's accuracy in estimating EFRs is, however, dependent on the assumed neurophysiological processing delay needed to time-align the f0 time course and the recorded electroencephalogram (EEG). For male-spoken vowels (f0 ~ 100 Hz), a constant 10-ms delay correction is often assumed. Since processing delays vary with stimulus and physiological factors, we quantified (i) the delay-related variability that would occur in EFR estimation, and (ii) the influence of stimulus frequency, non-f0 related neural activity, and the listener's age on such variability. EFRs were elicited by the low-frequency first formant, and mid-frequency second and higher formants of /u/, /a/, and /i/ in young adults and 6- to 17-year-old children. To time-align with the f0 time course, EEG was shifted by delays between 5 and 25 ms to encompass plausible response latencies. The delay-dependent range in EFR amplitude did not vary by stimulus frequency or age and was significantly smaller when interference from low-frequency activity was reduced. On average, the delay-dependent range was < 22% of the maximum variability in EFR amplitude that could be expected by noise. Results suggest that using a constant EEG delay correction in FA analysis does not substantially alter EFR amplitude estimation. 
In the present study, the lack of substantial variability was likely facilitated by using vowels with small f0 ranges.}, } @article {pmid35993422, year = {2024}, author = {Clarke, H and Leav, S and Zestic, J and Mohamed, I and Salisbury, I and Sanderson, P}, title = {Enhanced Neonatal Pulse Oximetry Sounds for the First Minutes of Life: A Laboratory Trial.}, journal = {Human factors}, volume = {66}, number = {4}, pages = {1017-1036}, doi = {10.1177/00187208221118472}, pmid = {35993422}, issn = {1547-8181}, mesh = {Humans ; Infant, Newborn ; *Resuscitation ; *Oximetry ; Oxygen ; Sound ; Heart Rate ; }, abstract = {OBJECTIVE: Auditory enhancements to the pulse oximetry tone may help clinicians detect deviations from target ranges for oxygen saturation (SpO2) and heart rate (HR).

BACKGROUND: Clinical guidelines recommend target ranges for SpO2 and HR during neonatal resuscitation in the first 10 minutes after birth. The pulse oximeter currently maps HR to tone rate, and SpO2 to tone pitch. However, deviations from target ranges for SpO2 and HR are not easy to detect.

METHOD: Forty-one participants were presented with 30-second simulated scenarios of an infant's SpO2 and HR levels in the first minutes after birth. Tremolo marked distinct HR ranges and formants marked distinct SpO2 ranges. Participants were randomly allocated to conditions: (a) No Enhancement control, (b) Enhanced HR Only, (c) Enhanced SpO2 Only, and (d) Enhanced Both.
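To make the tremolo enhancement concrete, one simple implementation amplitude-modulates the oximeter beep at a rate that identifies the current HR range; a minimal numpy sketch with hypothetical pitch, rate, and depth values (not the study's parameters):

```python
import numpy as np

def oximeter_beep(pitch_hz, tremolo_hz, fs=44100, dur=0.15, depth=0.8):
    """One pulse-oximeter-style beep: a sine tone (whose pitch would track SpO2)
    with sinusoidal amplitude modulation (tremolo) marking an HR range."""
    t = np.arange(int(fs * dur)) / fs
    carrier = np.sin(2.0 * np.pi * pitch_hz * t)
    # Amplitude swings between (1 - depth) and 1 at the tremolo rate.
    am = 1.0 - depth * 0.5 * (1.0 + np.sin(2.0 * np.pi * tremolo_hz * t))
    return carrier * am

beep = oximeter_beep(880.0, 30.0)  # hypothetical pitch and tremolo rate
print(beep.shape)
```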

RESULTS: Participants in the Enhanced HR Only and Enhanced SpO2 Only conditions identified HR and SpO2 ranges, respectively, more accurately than participants in the No Enhancement condition, ps < 0.001. In the Enhanced Both condition, the tremolo enhancement of HR did not affect participants' ability to identify SpO2 range, but the formants enhancement of SpO2 may have attenuated participants' ability to identify tremolo-enhanced HR range.

CONCLUSION: Tremolo and formant enhancements improve range identification for HR and SpO2, respectively, and could improve clinicians' ability to identify SpO2 and HR ranges in the first minutes after birth.

APPLICATION: Enhancements to the pulse oximeter tone to indicate clinically important ranges could improve the management of oxygen delivery to the neonate during resuscitation in the first 10 minutes after birth.}, } @article {pmid35961825, year = {2022}, author = {Nascimento, GFD and Silva, HJD and Oliveira, KGSC and Lira, SZ and Gomes, AOC}, title = {Relationship Between Oropharyngeal Geometry and Acoustic Parameters in Singers: A Preliminary Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.07.012}, pmid = {35961825}, issn = {1873-4588}, abstract = {OBJECTIVE: To verify possible correlations between formant and cepstral parameters and oropharyngeal geometry in singers, stratified by sex.

METHOD: Voice records and oropharyngeal measures of 31 singers - 13 females and 18 males, mean age of 28 (±5.0) years - were retrieved from a database and analyzed. The oropharyngeal geometry measures were collected with acoustic pharyngometry, and the voice records consisted of sustained phonation of the vowel /ɛ/, which was exported to Praat software and edited to obtain the formant and cepstral parameters, stratified by sex. The Pearson linear correlation test was applied to relate voice parameters to oropharyngeal geometry, at the 5% significance level; linear regression was used to model the variable related to the second formant.

RESULTS: Differences between the sexes were identified only in the oral cavity length (greater in males) and pharyngeal cavity length (greater in females). There was a linear correlation between the third formant and the cepstrum in the female group. In the male group, there was a linear correlation between the cepstrum and the third and fourth formants. A positive linear correlation with up to 95% confidence was also identified between the pharyngeal cavity volume and the second formant in the female group, making it possible to estimate a regression model for the second formant (R2 = 0.70).
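A simple ordinary-least-squares fit illustrates the kind of one-predictor model estimated here; the volume and F2 values below are invented for illustration (the study's R2 = 0.70 comes from its own data):

```python
import numpy as np
from scipy.stats import linregress

# Hypothetical pharyngeal cavity volumes (cm^3) and second-formant values (Hz)
# for a small group of female singers; values are invented for illustration.
volume = np.array([28.0, 31.5, 25.2, 34.1, 29.8, 27.3, 33.0, 30.6])
f2_hz = np.array([1850.0, 1915.0, 1790.0, 1985.0, 1875.0, 1830.0, 1950.0, 1895.0])

fit = linregress(volume, f2_hz)
print(f"F2 = {fit.slope:.1f} * volume + {fit.intercept:.1f}"
      f" (R^2 = {fit.rvalue ** 2:.2f})")
```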

CONCLUSION: There are sex-related correlations between oropharyngeal geometry and the formant and cepstral parameters. In females, the pharyngeal cavity volume showed the strongest correlation with the second formant.}, } @article {pmid35951711, year = {2022}, author = {Nishimura, T and Tokuda, IT and Miyachi, S and Dunn, JC and Herbst, CT and Ishimura, K and Kaneko, A and Kinoshita, Y and Koda, H and Saers, JPP and Imai, H and Matsuda, T and Larsen, ON and Jürgens, U and Hirabayashi, H and Kojima, S and Fitch, WT}, title = {Evolutionary loss of complexity in human vocal anatomy as an adaptation for speech.}, journal = {Science (New York, N.Y.)}, volume = {377}, number = {6607}, pages = {760-763}, doi = {10.1126/science.abm1574}, pmid = {35951711}, issn = {1095-9203}, mesh = {Animals ; *Biological Evolution ; Humans ; *Larynx/anatomy & histology ; *Phonation ; Phonetics ; *Primates ; *Speech ; Speech Acoustics ; *Vocal Cords/anatomy & histology ; }, abstract = {Human speech production obeys the same acoustic principles as vocal production in other animals but has distinctive features: A stable vocal source is filtered by rapidly changing formant frequencies. To understand speech evolution, we examined a wide range of primates, combining observations of phonation with mathematical modeling. We found that source stability relies upon simplifications in laryngeal anatomy, specifically the loss of air sacs and vocal membranes. We conclude that the evolutionary loss of vocal membranes allows human speech to mostly avoid the spontaneous nonlinear phenomena and acoustic chaos common in other primate vocalizations. This loss allows our larynx to produce stable, harmonic-rich phonation, ideally highlighting formant changes that convey most phonetic information. Paradoxically, the increased complexity of human spoken language thus followed simplification of our laryngeal anatomy.}, } @article {pmid35944059, year = {2022}, author = {Suresh, CH and Krishnan, A}, title = {Frequency-Following Response to Steady-State Vowel in Quiet and Background Noise Among Marching Band Participants With Normal Hearing.}, journal = {American journal of audiology}, volume = {31}, number = {3}, pages = {719-736}, doi = {10.1044/2022_AJA-21-00226}, pmid = {35944059}, issn = {1558-9137}, mesh = {Acoustic Stimulation/methods ; Auditory Perception/physiology ; Hearing ; Humans ; *Noise ; Sound ; *Speech Perception/physiology ; }, abstract = {OBJECTIVE: Human studies enrolling individuals at high risk for cochlear synaptopathy (CS) have reported difficulties in speech perception in adverse listening conditions. The aim of this study is to determine if these individuals show a degradation in the neural encoding of speech in quiet and in the presence of background noise as reflected in neural phase-locking to both envelope periodicity and temporal fine structure (TFS). To our knowledge, there are no published reports that have specifically examined the neural encoding of both envelope periodicity and TFS of speech stimuli (in quiet and in adverse listening conditions) among a sample with loud-sound exposure history who are at risk for CS.

METHOD: Using the scalp-recorded frequency-following response (FFR), the authors evaluated the neural encoding of envelope periodicity (FFRENV) and TFS (FFRTFS) for a steady-state vowel (English back vowel /u/) in quiet and in the presence of speech-shaped noise presented at +5 and 0 dB SNR. Participants were young individuals with normal hearing who had either participated in a marching band for at least 5 years (high-risk group) or had no marching band experience and a low noise exposure history (low-risk group).

RESULTS: The results showed no group differences in the neural encoding of either the FFRENV or the first formant (F1) in the FFRTFS in quiet and in noise. Paradoxically, the high-risk group demonstrated enhanced representation of F2 harmonics across all stimulus conditions.

CONCLUSIONS: These results appear to be in line with a music experience-dependent enhancement of F2 harmonics. However, due to sound overexposure in the high-risk group, the role of homeostatic central compensation cannot be ruled out. A larger-scale data set with different noise exposure backgrounds, together with longitudinal measurements using an array of behavioral and electrophysiological tests, is needed to disentangle the nature of the complex interaction between the effects of central compensatory gain and experience-dependent enhancement.}, } @article {pmid35944047, year = {2022}, author = {McAllister, T and Eads, A and Kabakoff, H and Scott, M and Boyce, S and Whalen, DH and Preston, JL}, title = {Baseline Stimulability Predicts Patterns of Response to Traditional and Ultrasound Biofeedback Treatment for Residual Speech Sound Disorder.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {8}, pages = {2860-2880}, pmid = {35944047}, issn = {1558-9102}, support = {F31 DC018197/DC/NIDCD NIH HHS/United States ; R01 DC017476/DC/NIDCD NIH HHS/United States ; R01 DC013668/DC/NIDCD NIH HHS/United States ; }, mesh = {*Apraxias ; Biofeedback, Psychology/methods ; Humans ; Language ; Speech/physiology ; *Speech Sound Disorder/diagnostic imaging/therapy ; Speech Therapy/methods ; }, abstract = {PURPOSE: This study aimed to identify predictors of response to treatment for residual speech sound disorder (RSSD) affecting English rhotics. Progress was tracked during an initial phase of traditional motor-based treatment and a longer phase of treatment incorporating ultrasound biofeedback. Based on previous literature, we focused on baseline stimulability and sensory acuity as predictors of interest.

METHOD: Thirty-three individuals aged 9-15 years with residual distortions of /ɹ/ received a course of individual intervention comprising 1 week of intensive traditional treatment and 9 weeks of ultrasound biofeedback treatment. Stimulability for /ɹ/ was probed prior to treatment, after the traditional treatment phase, and after the end of all treatment. Accuracy of /ɹ/ production in each probe was assessed with an acoustic measure: normalized third formant (F3)-second formant (F2) distance. Model-based clustering analysis was applied to these acoustic measures to identify different average trajectories of progress over the course of treatment. The resulting clusters were compared with respect to acuity in auditory and somatosensory domains.
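The F3-F2 distance used as the accuracy measure here can be extracted with Praat or its Python interface, parselmouth; a sketch assuming a WAV file and a hand-marked /ɹ/ interval (the study's normalization step is omitted):

```python
import numpy as np
import parselmouth  # Python interface to Praat

def mean_f3_f2_distance(wav_path, t_start, t_end, step=0.01):
    """Mean F3 - F2 distance (Hz) over a marked /ɹ/ interval; smaller distances
    generally correspond to a more accurate American English /ɹ/."""
    snd = parselmouth.Sound(wav_path)
    formants = snd.to_formant_burg(time_step=step, maximum_formant=5500.0)
    times = np.arange(t_start, t_end, step)
    f2 = np.array([formants.get_value_at_time(2, t) for t in times])
    f3 = np.array([formants.get_value_at_time(3, t) for t in times])
    valid = ~np.isnan(f2) & ~np.isnan(f3)  # Praat returns NaN where tracking fails
    return float(np.mean(f3[valid] - f2[valid]))

# Hypothetical usage, with an invented file name and hand-marked interval:
# print(mean_f3_f2_distance("rhotic_probe.wav", 0.35, 0.55))
```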

RESULTS: All but four individuals were judged to exhibit a clinically significant response to the combined course of treatment. Two major clusters were identified. The "low stimulability" cluster was characterized by very low accuracy at baseline, minimal response to traditional treatment, and strong response to ultrasound biofeedback. The "high stimulability" group was more accurate at baseline and made significant gains in both traditional and ultrasound biofeedback phases of treatment. The clusters did not differ with respect to sensory acuity.

CONCLUSIONS: This research accords with clinical intuition in finding that individuals who are more stimulable at baseline are more likely to respond to traditional intervention, whereas less stimulable individuals may derive greater relative benefit from biofeedback.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.20422236.}, } @article {pmid35931553, year = {2022}, author = {Levi, SV}, title = {Teaching acoustic phonetics to undergraduates in communication sciences and disorders: Course structure and sample projects.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {1}, pages = {651}, doi = {10.1121/10.0012984}, pmid = {35931553}, issn = {1520-8524}, mesh = {Acoustics ; Communication ; Humans ; *Phonetics ; *Speech Acoustics ; Students ; }, abstract = {Virtually all undergraduate communication sciences and disorders programs require a course that covers acoustic phonetics. Students typically have a separate phonetics (transcription) course prior to taking the acoustic phonetics course. This paper describes a way to structure an acoustic phonetics course into two halves: a first half that focuses on the source, including basic acoustics (simple harmonic motion, harmonics), vocal fold vibration, modes of phonation, and intonation, and a second half that focuses on the filter, including resonance and tube models, vowel formants, and consonant acoustics. Thus, basic acoustic properties are interwoven with specific examples of speech-related acoustics. In addition, two projects that illustrate concepts from the two halves of the course (one on fundamental frequency and the other on vowel formants) are presented.}, } @article {pmid35931547, year = {2022}, author = {Mills, HE and Shorey, AE and Theodore, RM and Stilp, CE}, title = {Context effects in perception of vowels differentiated by F1 are not influenced by variability in talkers' mean F1 or F3.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {1}, pages = {55}, doi = {10.1121/10.0011920}, pmid = {35931547}, issn = {1520-8524}, mesh = {*Phonetics ; Sound ; Sound Spectrography ; Speech Acoustics ; *Speech Perception ; }, abstract = {Spectral properties of earlier sounds (context) influence recognition of later sounds (target). Acoustic variability in context stimuli can disrupt this process. When mean fundamental frequencies (f0's) of preceding context sentences were highly variable across trials, shifts in target vowel categorization [due to spectral contrast effects (SCEs)] were smaller than when sentence mean f0's were less variable; when sentences were rearranged to exhibit high or low variability in mean first formant frequencies (F1) in a given block, SCE magnitudes were equivalent [Assgari, Theodore, and Stilp (2019) J. Acoust. Soc. Am. 145(3), 1443-1454]. However, since sentences were originally chosen based on variability in mean f0, stimuli underrepresented the extent to which mean F1 could vary. Here, target vowels (/ɪ/-/ɛ/) were categorized following context sentences that varied substantially in mean F1 (experiment 1) or mean F3 (experiment 2) with variability in mean f0 held constant. In experiment 1, SCE magnitudes were equivalent whether context sentences had high or low variability in mean F1; the same pattern was observed in experiment 2 for new sentences with high or low variability in mean F3. 
Variability in some acoustic properties (mean f0) can be more perceptually consequential than others (mean F1, mean F3), but these results may be task-dependent.}, } @article {pmid35920586, year = {2023}, author = {Feng, Y and Peng, G}, title = {Development of categorical speech perception in Mandarin-speaking children and adolescents.}, journal = {Child development}, volume = {94}, number = {1}, pages = {28-43}, pmid = {35920586}, issn = {1467-8624}, mesh = {Male ; Adult ; Humans ; Child ; Adolescent ; *Speech Perception ; Cross-Sectional Studies ; Linguistics ; Asian ; China ; }, abstract = {Although children develop categorical speech perception at a very young age, the maturation process remains unclear. A cross-sectional study in Mandarin-speaking 4-, 6-, and 10-year-old children, 14-year-old adolescents, and adults (n = 104, 56 males, all Asians from mainland China) was conducted to investigate the development of categorical perception of four Mandarin phonemic contrasts: lexical tone contrast Tone 1-2, vowel contrast /u/-/i/, consonant aspiration contrast /p/-/p[h] /, and consonant formant transition contrast /p/-/t/. The results indicated that different types of phonemic contrasts, and even the identification and discrimination of the same phonemic contrast, matured asynchronously. The observation that tone and vowel perception are achieved earlier than consonant perception supports the phonological saliency hypothesis.}, } @article {pmid35916929, year = {2023}, author = {Song, J and Wan, Q and Wang, Y and Zhou, H}, title = {Establishment of a Multi-parameter Evaluation Model for Risk of Aspiration in Dysphagia: A Pilot Study.}, journal = {Dysphagia}, volume = {38}, number = {1}, pages = {406-414}, pmid = {35916929}, issn = {1432-0460}, mesh = {Humans ; Deglutition ; *Deglutition Disorders/diagnosis/etiology ; Pilot Projects ; Risk Factors ; }, abstract = {It's difficult for clinical bedside evaluations to accurately determine the occurrence of aspiration in patients. Although VFSS and FEES are the gold standards for clinical diagnosis of dysphagia, which are mainly used to evaluate people at high risk of dysphagia found by bedside screening, the operation is complicated and time-consuming. The aim of this pilot study was to present an objective measure based on a multi-parameter approach to screen for aspiration risk in patients with dysphagia. Objective evaluation techniques based on speech parameters were used to assess the oral motor function, vocal cord function, and voice changes before and after swallowing in 32 patients with dysphagia (16 low-risk aspiration group, 16 high-risk aspiration group). Student's t test combined with stepwise logistic regression were used to determine the optimal index. The best model consists of three parameters, and the equation is: logit(P) = - 3.824 - (0.504 × maximum phonation time) + (0.008 × second formant frequency of /u/) - 0.085 × (fundamental frequency difference before and after swallowing). An additional eight patients with dysphagia were randomly selected as the validation group of the model. When applied to validation, this model can accurately identify the risk of aspiration in 87.5% of patients, and the sensitivity is as high as 100%. 
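Applying the reported three-parameter equation is mechanical; a minimal sketch, with units assumed to be seconds for maximum phonation time and Hz for the formant and fundamental frequency terms (the abstract does not state them):

    import math

    def aspiration_risk(mpt_s, f2_u_hz, delta_f0_hz):
        # Predicted probability of high aspiration risk from the
        # three-parameter logistic model reported in the abstract.
        logit_p = (-3.824
                   - 0.504 * mpt_s         # maximum phonation time
                   + 0.008 * f2_u_hz       # second formant of /u/
                   - 0.085 * delta_f0_hz)  # F0 change after swallowing
        return 1.0 / (1.0 + math.exp(-logit_p))

    # Illustrative values only: a short phonation time and a high F2
    # of /u/ push the predicted probability up.
    print(round(aspiration_risk(5.0, 900.0, 2.0), 3))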
Therefore, it has certain clinical practical value that may help clinicians to assess the risk of aspiration in patients with dysphagia, especially for silent aspiration.}, } @article {pmid35905807, year = {2022}, author = {Lee, GS and Chang, CW}, title = {Comparisons of auditory brainstem response elicited by compound click-sawtooths sound and synthetic consonant-vowel /da/.}, journal = {Physiology & behavior}, volume = {255}, number = {}, pages = {113922}, doi = {10.1016/j.physbeh.2022.113922}, pmid = {35905807}, issn = {1873-507X}, mesh = {Acoustic Stimulation ; Evoked Potentials, Auditory/physiology ; *Evoked Potentials, Auditory, Brain Stem/physiology ; Humans ; Phonetics ; Reaction Time/physiology ; Sound ; *Speech Perception/physiology ; }, abstract = {The auditory brainstem response to complex sounds (cABR) could be evoked using speech sounds such as the 40 ms synthetic consonant-vowel syllable /da/ (CV-da) that was commonly used in basic and clinical research. cABR consists of responses to formant energy as well as the energy of fundamental frequency. The co-existence of the two energy makes cABR a mixed response. We introduced a new stimulus of click-sawtooths (CSW) with similar time-lock patterns but without formant or harmonic energy. Ten young healthy volunteers were recruited and the cABRs of CV-da and CSW of their 20 ears were acquired. The response latencies, amplitudes, and frequency-domain analytic results were compared pairwisely between stimuli. The response amplitudes were significantly greater for CSW and the latencies were significantly shorter for CSW. The latency-intensity functions were also greater for CSW. For CSW, adjustments of energy component can be made without causing biased changes to the other. CSW may be used in future basic research and clinical applications.}, } @article {pmid35894373, year = {2022}, author = {França, FP and Almeida, AA and Lopes, LW}, title = {Immediate effect of different exercises in the vocal space of women with and without vocal nodules.}, journal = {CoDAS}, volume = {34}, number = {5}, pages = {e20210157}, pmid = {35894373}, issn = {2317-1782}, mesh = {Exercise ; Female ; Humans ; Language ; *Phonetics ; *Speech Acoustics ; Tongue ; }, abstract = {PURPOSE: To investigate the immediate effect of voiced tongue vibration (VSL), high-resistance straw in the air (CAR), and overarticulation (OA) on the vocal space of vocally healthy women (MVS) and with vocal nodules (MNV).

METHODS: Twelve women participated in the MNV and 12 in the MVS, allocated to perform the vocal exercises of VSL, CAR, and OA. Each participant performed only one of the three proposed exercises, for 5 minutes, preceded and followed by the recording of a sequence of carrier sentences for extracting the formants (F1 and F2) of the vowel segments [a, i, u]. The vowel space was analyzed through the differences between the formant measures of the vowels.

RESULTS: We observed a reduction of F1 in the intervals [a]-[i] and [i]-[u] and of F2 between the vowels [a]-[u] and [i]-[u] in the MVS after performing the CAR. In the MNV, we observed a reduction of F2 in the interval [a]-[i] after VSL. In the intergroup analysis, there were higher F1 values in the intervals between the vowels [a]-[i] and [i]-[u] in the MVS before performing the CAR, and after exercise only in the interval [a]-[i]. A higher value of F1 and F2 was observed in the interval between the vowels [i]-[u] in the MNV after VSL.

CONCLUSION: The VSL exercise reduced the vowel space in MNV women. CAR reduced the vowel space of women in the MVS. The MNV had a smaller vowel space compared to the MVS before and after the CAR. We observed a reduction in the vowel space in the MVS compared to the MNV after the VSL exercise.}, } @article {pmid35874163, year = {2022}, author = {Wang, H and Max, L}, title = {Inter-Trial Formant Variability in Speech Production Is Actively Controlled but Does Not Affect Subsequent Adaptation to a Predictable Formant Perturbation.}, journal = {Frontiers in human neuroscience}, volume = {16}, number = {}, pages = {890065}, pmid = {35874163}, issn = {1662-5161}, support = {R01 DC014510/DC/NIDCD NIH HHS/United States ; R01 DC017444/DC/NIDCD NIH HHS/United States ; }, abstract = {Despite ample evidence that speech production is associated with extensive trial-to-trial variability, it remains unclear whether this variability represents merely unwanted system noise or an actively regulated mechanism that is fundamental for maintaining and adapting accurate speech movements. Recent work on upper limb movements suggests that inter-trial variability may be not only actively regulated based on sensory feedback, but also provide a type of workspace exploration that facilitates sensorimotor learning. We therefore investigated whether experimentally reducing or magnifying inter-trial formant variability in the real-time auditory feedback during speech production (a) leads to adjustments in formant production variability that compensate for the manipulation, (b) changes the temporal structure of formant adjustments across productions, and (c) enhances learning in a subsequent adaptation task in which a predictable formant-shift perturbation is applied to the feedback signal. Results show that subjects gradually increased formant variability in their productions when hearing auditory feedback with reduced variability, but subsequent formant-shift adaptation was not affected by either reducing or magnifying the perceived variability. Thus, findings provide evidence for speakers' active control of inter-trial formant variability based on auditory feedback from previous trials, but, at least for the current short-term experimental manipulation of feedback variability, not for a role of this variability regulation mechanism in subsequent auditory-motor learning.}, } @article {pmid35865705, year = {2022}, author = {Mailhos, A and Egea-Caparrós, DA and Guerrero Rodríguez, C and Luzardo, M and Kiskimska, ND and Martínez Sánchez, F}, title = {Vocal Cues to Male Physical Formidability.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {879102}, pmid = {35865705}, issn = {1664-1078}, abstract = {Animal vocalizations convey important information about the emitter, including sex, age, biological quality, and emotional state. Early on, Darwin proposed that sex differences in auditory signals and vocalizations were driven by sexual selection mechanisms. In humans, studies on the association between male voice attributes and physical formidability have thus far reported mixed results. Hence, with a view to furthering our understanding of the role of human voice in advertising physical formidability, we sought to identify acoustic attributes of male voices associated with physical formidability proxies.
Mean fundamental frequency (F0), formant dispersion (Df), formant position (Pf), and vocal tract length (VTL) data from a sample of 101 male voices were analyzed for potential associations with height, weight, and maximal handgrip strength (HGS). F0 correlated negatively with HGS; Pf showed negative correlations with HGS, height and weight, whereas VTL positively correlated with HGS, height and weight. All zero-order correlations remained significant after controlling for false discovery rate (FDR) with the Benjamini-Hochberg method. After controlling for height and weight, and controlling for FDR, the correlation between F0 and HGS remained significant. In addition, to evaluate the ability of human male voices to advertise physical formidability to potential mates, 151 heterosexual female participants rated the voices of the 10 strongest and the 10 weakest males from the original sample for perceived physical strength and, given that physical strength is a desirable attribute in male partners, perceived attractiveness. Generalized linear mixed model analyses, which allow for generalization of inferences to other samples of both raters and targets, failed to support a significant association between perceived strength or attractiveness from voices alone and actual physical strength. These results add to the growing body of work on the role of human voices in conveying relevant biological information.}, } @article {pmid35858255, year = {2022}, author = {Shao, J and Bakhtiar, M and Zhang, C}, title = {Impaired Categorical Perception of Speech Sounds Under the Backward Masking Condition in Adults Who Stutter.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {7}, pages = {2554-2570}, doi = {10.1044/2022_JSLHR-21-00276}, pmid = {35858255}, issn = {1558-9102}, mesh = {Adult ; Auditory Perception ; Child ; Humans ; Phonetics ; Speech ; *Speech Perception ; *Stuttering ; *Voice ; }, abstract = {PURPOSE: Evidence increasingly indicates that people with developmental stuttering have auditory perception deficits. Our previous research has indicated similar but slower performance in categorical perception of the speech sounds under the quiet condition in children who stutter and adults who stutter (AWS) compared with their typically fluent counterparts. We hypothesized that the quiet condition may not be sufficiently sensitive to reveal subtle perceptual deficiencies in people who stutter. This study examined this hypothesis by testing the categorical perception of speech and nonspeech sounds under the backward masking condition (i.e., a noise was presented immediately after the target stimuli).

METHOD: Fifteen Cantonese-speaking AWS and 15 adults who do not stutter (AWNS) were tested on the categorical perception of four stimulus continua, namely, consonant varying in voice onset time (VOT), vowel, lexical tone, and nonspeech, under the backward masking condition using identification and discrimination tasks.

RESULTS: AWS demonstrated a broader boundary width than AWNS in the identification task. AWS also exhibited a worse performance than AWNS in the discrimination of between-category stimuli but a comparable performance in the discrimination of within-category stimuli, indicating reduced sensitivity to sounds that belonged to different phonemic categories among AWS. Moreover, AWS showed similar patterns of impaired categorical perception across the four stimulus types, although the boundary location on the VOT continuum occurred at an earlier point in AWS than in AWNS.
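Boundary location and width of the kind reported here are commonly obtained by fitting a logistic psychometric function to the identification responses; a minimal sketch with hypothetical data (the authors' exact fitting procedure is not given in the abstract):

    import numpy as np
    from scipy.optimize import curve_fit

    def logistic(x, loc, slope):
        return 1.0 / (1.0 + np.exp(-slope * (x - loc)))

    # Hypothetical proportions of "voiced" responses along a 7-step
    # VOT continuum.
    steps = np.arange(1, 8)
    p_resp = np.array([0.02, 0.05, 0.15, 0.50, 0.85, 0.95, 0.98])

    (loc, slope), _ = curve_fit(logistic, steps, p_resp, p0=[4.0, 1.0])
    # Boundary location = 50% point; one common width measure is the
    # 25%-75% distance, which for a logistic equals 2*ln(3)/slope.
    width = 2 * np.log(3) / slope
    print(round(loc, 2), round(width, 2))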

CONCLUSIONS: The findings provide robust evidence that AWS exhibit impaired categorical perception of speech and nonspeech sounds under the backward masking condition. Temporal processing (i.e., VOT manipulation), frequency/spectral/formant processing (i.e., lexical tone or vowel manipulations), and nonlinguistic pitch processing were all found to be impaired in AWS. Altogether, the findings support the hypothesis that AWS might be less efficient in accessing the phonemic representations when exposed to a demanding listening condition.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.20249718.}, } @article {pmid35858067, year = {2022}, author = {Baciadonna, L and Solvi, C and Del Vecchio, F and Pilenga, C and Baracchi, D and Bandoli, F and Isaja, V and Gamba, M and Favaro, L}, title = {Vocal accommodation in penguins (Spheniscus demersus) as a result of social environment.}, journal = {Proceedings. Biological sciences}, volume = {289}, number = {1978}, pages = {20220626}, pmid = {35858067}, issn = {1471-2954}, mesh = {Animals ; Communication ; Humans ; Language ; Social Environment ; *Spheniscidae ; Vocalization, Animal ; }, abstract = {The ability to vary the characteristics of one's voice is a critical feature of human communication. Understanding whether and how animals change their calls will provide insights into the evolution of language. We asked to what extent the vocalizations of penguins, a phylogenetically distant species from those capable of explicit vocal learning, are flexible and responsive to their social environment. Using a principal components (PCs) analysis, we reduced 14 vocal parameters of penguin's contact calls to four PCs, each comprising highly correlated parameters and which can be categorized as fundamental frequency, formant frequency, frequency modulation, and amplitude modulation rate and duration. We compared how these differed between individuals with varying degrees of social interactions: same-colony versus different-colony, same colony over 3 years and partners versus non-partners. Our analyses indicate that the more penguins experience each other's calls, the more similar their calls become over time, that vocal convergence requires a long time and relative stability in colony membership, and that partners' unique social bond may affect vocal convergence differently than non-partners. Our results suggest that this implicit form of vocal plasticity is perhaps more widespread across the animal kingdom than previously thought and may be a fundamental capacity of vertebrate vocalization.}, } @article {pmid35804282, year = {2022}, author = {Easwar, V and Chung, L}, title = {The influence of phoneme contexts on adaptation in vowel-evoked envelope following responses.}, journal = {The European journal of neuroscience}, volume = {56}, number = {5}, pages = {4572-4582}, pmid = {35804282}, issn = {1460-9568}, mesh = {Acoustic Stimulation ; Humans ; Male ; Phonetics ; *Speech Perception/physiology ; }, abstract = {Repeated stimulus presentation leads to neural adaptation and consequent amplitude reduction in vowel-evoked envelope following responses (EFRs)-a response that reflects neural activity phase-locked to envelope periodicity. EFRs are elicited by vowels presented in isolation or in the context of other phonemes such as consonants in syllables. While context phonemes could exert some forward influence on vowel-evoked EFRs, they may reduce the degree of adaptation. Here, we evaluated whether the properties of context phonemes between consecutive vowel stimuli influence adaptation. EFRs were elicited by the low-frequency first formant (resolved harmonics) and middle-to-high-frequency second and higher formants (unresolved harmonics) of a male-spoken /i/ when the presence, number and predictability of context phonemes (/s/, /a/, /∫/ and /u/) between vowel repetitions varied. Monitored over four iterations of /i/, adaptation was evident only for EFRs elicited by the unresolved harmonics. 
EFRs elicited by the unresolved harmonics decreased in amplitude by ~16-20 nV (10%-17%) after the first presentation of /i/ and remained stable thereafter. EFR adaptation was reduced by the presence of a context phoneme, but the reduction did not change with the number or predictability of the context phonemes. The presence of a context phoneme, however, attenuated EFRs by a degree similar to that caused by adaptation (~21-23 nV). Such a trade-off in the short- and long-term influence of context phonemes suggests that the benefit of interleaving EFR-eliciting vowels with other context phonemes depends on whether the use of consonant-vowel syllables is critical to improve the validity of EFR applications.}, } @article {pmid35802401, year = {2022}, author = {Teferra, BG and Borwein, S and DeSouza, DD and Simpson, W and Rheault, L and Rose, J}, title = {Acoustic and Linguistic Features of Impromptu Speech and Their Association With Anxiety: Validation Study.}, journal = {JMIR mental health}, volume = {9}, number = {7}, pages = {e36828}, pmid = {35802401}, issn = {2368-7959}, abstract = {BACKGROUND: The measurement and monitoring of generalized anxiety disorder require frequent interaction with psychiatrists or psychologists. Access to mental health professionals is often difficult because of high costs or insufficient availability. The ability to assess generalized anxiety disorder passively and at frequent intervals could be a useful complement to conventional treatment and help with relapse monitoring. Prior work suggests that higher anxiety levels are associated with features of human speech. As such, monitoring speech using personal smartphones or other wearable devices may be a means to achieve passive anxiety monitoring.

OBJECTIVE: This study aims to validate the association of previously suggested acoustic and linguistic features of speech with anxiety severity.

METHODS: A large number of participants (n=2000) were recruited and participated in a single web-based study session. Participants completed the Generalized Anxiety Disorder 7-item scale assessment and provided an impromptu speech sample in response to a modified version of the Trier Social Stress Test. Acoustic and linguistic speech features were a priori selected based on the existing speech and anxiety literature, along with related features. Associations between speech features and anxiety levels were assessed using age and personal income as covariates.

RESULTS: Word count and speaking duration were negatively correlated with anxiety scores (r=-0.12; P<.001), indicating that participants with higher anxiety scores spoke less. Several acoustic features were also significantly (P<.05) associated with anxiety, including the mel-frequency cepstral coefficients, linear prediction cepstral coefficients, shimmer, fundamental frequency, and first formant. In contrast to previous literature, second and third formant, jitter, and zero crossing rate for the z score of the power spectral density acoustic features were not significantly associated with anxiety. Linguistic features, including negative-emotion words, were also associated with anxiety (r=0.10; P<.001). In addition, some linguistic relationships were sex dependent. For example, the count of words related to power was positively associated with anxiety in women (r=0.07; P=.03), whereas it was negatively associated with anxiety in men (r=-0.09; P=.01).
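Covariate-adjusted associations of this kind (here, age and personal income as covariates) can be computed as residual-based partial correlations; a sketch with synthetic data, noting that the study's exact statistical procedure may differ:

    import numpy as np
    from scipy import stats

    def partial_corr(x, y, covars):
        # Pearson correlation between x and y after regressing the
        # covariate columns out of both (residual method).
        Z = np.column_stack([np.ones(len(x)), covars])
        rx = x - Z @ np.linalg.lstsq(Z, x, rcond=None)[0]
        ry = y - Z @ np.linalg.lstsq(Z, y, rcond=None)[0]
        return stats.pearsonr(rx, ry)

    rng = np.random.default_rng(0)
    n = 200
    age = rng.uniform(18, 65, n)
    income = rng.uniform(1, 10, n)
    word_count = rng.normal(100, 20, n)
    anxiety = 20 - 0.05 * word_count + 0.02 * age + rng.normal(0, 3, n)
    print(partial_corr(word_count, anxiety, np.column_stack([age, income])))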

CONCLUSIONS: Both acoustic and linguistic speech measures are associated with anxiety scores. The amount of speech, acoustic quality of speech, and gender-specific linguistic characteristics of speech may be useful as part of a system to screen for anxiety, detect relapse, or monitor treatment.}, } @article {pmid35778699, year = {2022}, author = {Lin, YC and Yan, HT and Lin, CH and Chang, HH}, title = {Predicting frailty in older adults using vocal biomarkers: a cross-sectional study.}, journal = {BMC geriatrics}, volume = {22}, number = {1}, pages = {549}, pmid = {35778699}, issn = {1471-2318}, mesh = {Aged ; Biomarkers ; Cross-Sectional Studies ; Female ; Frail Elderly ; *Frailty/diagnosis/epidemiology ; Humans ; Male ; Odds Ratio ; *Osteoporotic Fractures ; }, abstract = {BACKGROUND: Frailty is a common issue in the aging population. Given that frailty syndrome is little discussed in the literature on the aging voice, the current study aims to examine the relationship between frailty and vocal biomarkers in older people.

METHODS: Participants aged ≥ 60 years visiting geriatric outpatient clinics were recruited. They underwent frailty assessment (Cardiovascular Health Study [CHS] index; Study of Osteoporotic Fractures [SOF] index; and Fatigue, Resistance, Ambulation, Illness, and Loss of weight [FRAIL] index) and were asked to pronounce a sustained vowel /a/ for approximately 1 s. Four voice parameters were assessed: average number of zero crossings (A1), variations in local peaks and valleys (A2), variations in first and second formant frequencies (A3), and spectral energy ratio (A4).
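Rough analogues of two of these parameters, an A1-style zero-crossing count and an A4-style spectral energy ratio, can be sketched as follows; the paper's exact definitions, frame sizes and cut-off frequency are not given in the abstract, so all settings here are assumptions:

    import numpy as np

    def zero_crossings(frame):
        # A1-style measure: number of sign changes in the waveform.
        return int(np.sum(np.signbit(frame[:-1]) != np.signbit(frame[1:])))

    def spectral_energy_ratio(frame, sr, split_hz=1000.0):
        # A4-style measure: energy below vs. above a cut-off frequency.
        spec = np.abs(np.fft.rfft(frame)) ** 2
        freqs = np.fft.rfftfreq(len(frame), d=1.0 / sr)
        return spec[freqs < split_hz].sum() / spec[freqs >= split_hz].sum()

    sr = 16000
    t = np.arange(sr) / sr                      # 1 s of signal
    vowel = np.sin(2 * np.pi * 120 * t) + 0.3 * np.sin(2 * np.pi * 2400 * t)
    print(zero_crossings(vowel), round(spectral_energy_ratio(vowel, sr), 2))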

RESULTS: Among 277 older adults, increased A1 was associated with a lower likelihood of frailty as defined by SOF (odds ratio [OR] 0.84, 95% confidence interval [CI] 0.74-0.96). Participants with larger A2 values were more likely to be frail, as defined by FRAIL and CHS (FRAIL: OR 1.41, 95% CI 1.12-1.79; CHS: OR 1.38, 95% CI 1.10-1.75). Sex differences were observed across the three frailty indices. In male participants, an increase in A3 by 10 points increased the odds of frailty by almost 7% (SOF: OR 1.07, 95% CI 1.02-1.12), 6% (FRAIL: OR 1.06, 95% CI 1.02-1.11), or 6% (CHS: OR 1.06, 95% CI 1.01-1.11). In female participants, an increase in A4 by 0.1 conferred a significant 2.8-fold (SOF: OR 2.81, 95% CI 1.71-4.62), 2.3-fold (FRAIL: OR 2.31, 95% CI 1.45-3.68), or 2.8-fold (CHS: OR 2.82, 95% CI 1.76-4.51) increased odds of frailty.
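The odds-ratio arithmetic follows directly from the underlying logistic model; for example, converting the reported per-10-point odds ratio for A3 into the multiplier implied for other increments:

    import math

    # The OR per unit comes from the logistic coefficient: OR = exp(beta).
    # Abstract: +10 points of A3 -> OR 1.07 (SOF), i.e. ~7% higher odds.
    beta_per_point = math.log(1.07) / 10

    # Implied odds multiplier for a 25-point increase in A3:
    print(math.exp(beta_per_point * 25))  # ~1.18, i.e. ~18% higher odds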

CONCLUSIONS: Vocal biomarkers, especially spectral-domain voice parameters, might have potential for estimating frailty as a non-invasive, instantaneous, objective, and cost-effective estimation tool, and the observed sex differences may support individualised treatment of frailty.}, } @article {pmid35778208, year = {2022}, author = {Jibson, J}, title = {Formant detail needed for identifying, rating, and discriminating vowels in Wisconsin English.}, journal = {The Journal of the Acoustical Society of America}, volume = {151}, number = {6}, pages = {4004}, doi = {10.1121/10.0011539}, pmid = {35778208}, issn = {1520-8524}, mesh = {*Language ; Wisconsin ; }, abstract = {Neel [(2004). Acoust. Res. Lett. Online 5, 125-131] asked how much time-varying formant detail is needed for vowel identification. In that study, multiple stimuli were synthesized for each vowel: 1-point (monophthongal with midpoint frequencies), 2-point (linear from onset to offset), 3-point, 5-point, and 11-point. Results suggested that a 3-point model was optimal. This conflicted with the dual-target hypothesis of vowel inherent spectral change research, which has found that two targets are sufficient to model vowel identification. The present study replicates and expands upon the work of Neel. Ten English monophthongs were chosen for synthesis. One-, two-, three-, and five-point vowels were created as described above, and another 1-point stimulus was created with onset frequencies rather than midpoint frequencies. Three experiments were administered (n = 18 for each): vowel identification, goodness rating, and discrimination. The results ultimately align with the dual-target hypothesis, consistent with most vowel inherent spectral change studies.}, } @article {pmid35749662, year = {2022}, author = {Groll, MD and Dahl, KL and Cádiz, MD and Welch, B and Tracy, LF and Stepp, CE}, title = {Resynthesis of Transmasculine Voices to Assess Gender Perception as a Function of Testosterone Therapy.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {7}, pages = {2474-2489}, pmid = {35749662}, issn = {1558-9102}, support = {R01 DC015570/DC/NIDCD NIH HHS/United States ; R01 DC020061/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Speech ; Speech Acoustics ; *Speech Perception ; Testosterone ; *Voice ; Young Adult ; }, abstract = {PURPOSE: The goal of this study was to use speech resynthesis to investigate the effects of changes to individual acoustic features on speech-based gender perception of transmasculine voice samples following the onset of hormone replacement therapy (HRT) with exogenous testosterone. We hypothesized that mean fundamental frequency (fo) would have the largest effect on gender perception of any single acoustic feature.

METHOD: Mean fo, fo contour, and formant frequencies were calculated for three pairs of transmasculine speech samples before and after HRT onset. Sixteen speech samples with unique combinations of these acoustic features from each pair of speech samples were resynthesized. Twenty young adult listeners evaluated each synthesized speech sample for gender perception and synthetic quality. Two analyses of variance were used to investigate the effects of acoustic features on gender perception and synthetic quality.
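The resynthesis step can be approximated in Praat through the parselmouth library; this is a rough analogue rather than the authors' actual pipeline, and the file name and shift values are hypothetical:

    import parselmouth
    from parselmouth.praat import call

    sound = parselmouth.Sound("speech_sample.wav")  # hypothetical file

    # Shift formant frequencies up by 10% and set the new pitch median
    # to 160 Hz, leaving pitch range and duration untouched.
    shifted = call(sound, "Change gender", 75, 600, 1.10, 160, 1.0, 1.0)
    shifted.save("speech_sample_shifted.wav", "WAV")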

RESULTS: Of the three acoustic features, mean fo was the only single feature that had a statistically significant effect on gender perception. Differences between the speech samples before and after HRT onset that were not captured by changes in fo and formant frequencies also had a statistically significant effect on gender perception.

CONCLUSION: In these transmasculine voice samples, mean fo was the most important acoustic feature for voice masculinization as a result of HRT; future investigations in a larger number of transmasculine speakers and on the effects of behavioral therapy-based changes in concert with HRT are warranted.}, } @article {pmid35744460, year = {2022}, author = {Yan, S and Liu, P and Chen, Z and Liu, J and Shen, L and Zhang, X and Cui, J and Li, T and Cui, Y and Ren, Y}, title = {High-Property Refractive Index and Bio-Sensing Dual-Purpose Sensor Based on SPPs.}, journal = {Micromachines}, volume = {13}, number = {6}, pages = {}, pmid = {35744460}, issn = {2072-666X}, abstract = {A high-property plasma resonance-sensor structure consisting of two metal-insulator-metal (MIM) waveguides coupled with a transverse ladder-shaped nano-cavity (TLSNC) is designed based on surface plasmon polaritons. Its transmission characteristics are analyzed using multimode interference coupling mode theory (MICMT), and are simulated using finite element analysis (FEA). Meanwhile, the influence of different structural parameters on the performance of the structure is investigated. This study shows that the system presents four high-quality formants in the transmission spectrum. The highest sensitivity is 3000 nm/RIU with a high FOM[*] of 9.7 × 10[5]. In addition, the proposed structure could act as a biosensor to detect the concentrations of sodium ions (Na[+]), potassium ions (K[+]), and the glucose solution with maximum sensitivities of 0.45, 0.625 and 5.5 nm/mgdL[-1], respectively. Compared with other structures, the designed system has the advantages of a simple construction, a wide working band range, high reliability and easy nano-scale integration, providing a high-performance cavity choice for refractive index sensing and biosensing devices based on surface plasmons.}, } @article {pmid35737731, year = {2022}, author = {Ham, J and Yoo, HJ and Kim, J and Lee, B}, title = {Vowel speech recognition from rat electroencephalography using long short-term memory neural network.}, journal = {PloS one}, volume = {17}, number = {6}, pages = {e0270405}, pmid = {35737731}, issn = {1932-6203}, mesh = {Animals ; Electroencephalography/methods ; Male ; Memory, Short-Term ; Neural Networks, Computer ; Rats ; Rats, Sprague-Dawley ; Speech ; *Speech Perception ; }, abstract = {Over the years, considerable research has been conducted to investigate the mechanisms of speech perception and recognition. Electroencephalography (EEG) is a powerful tool for identifying brain activity; therefore, it has been widely used to determine the neural basis of speech recognition. In particular, for the classification of speech recognition, deep learning-based approaches are in the spotlight because they can automatically learn and extract representative features through end-to-end learning. This study aimed to identify particular components that are potentially related to phoneme representation in the rat brain and to discriminate brain activity for each vowel stimulus on a single-trial basis using a bidirectional long short-term memory (BiLSTM) network and classical machine learning methods. Nineteen male Sprague-Dawley rats underwent microelectrode implantation surgery to record EEG signals from the bilateral anterior auditory fields. Five different vowel speech stimuli were chosen, /a/, /e/, /i/, /o/, and /u/, which have highly different formant frequencies.
EEG recorded under randomly given vowel stimuli was minimally preprocessed and normalized by a z-score transformation to be used as input for the classification of speech recognition. The BiLSTM network showed the best performance among the classifiers by achieving an overall accuracy, f1-score, and Cohen's κ values of 75.18%, 0.75, and 0.68, respectively, using a 10-fold cross-validation approach. These results indicate that LSTM layers can effectively model sequential data, such as EEG; hence, informative features can be derived through BiLSTM trained with end-to-end learning without any additional hand-crafted feature extraction methods.}, } @article {pmid35731636, year = {2023}, author = {Pravitharangul, N and Miyamoto, JJ and Yoshizawa, H and Matsumoto, T and Suzuki, S and Chantarawaratit, PO and Moriyama, K}, title = {Vowel sound production and its association with cephalometric characteristics in skeletal Class III subjects.}, journal = {European journal of orthodontics}, volume = {45}, number = {1}, pages = {20-28}, doi = {10.1093/ejo/cjac031}, pmid = {35731636}, issn = {1460-2210}, mesh = {Male ; Humans ; *Speech Acoustics ; Speech ; Acoustics ; Cephalometry ; *Overbite ; }, abstract = {BACKGROUND: This study aimed to evaluate differences in vowel production using acoustic analysis in skeletal Class III and Class I Japanese participants and to identify the correlation between vowel sounds and cephalometric variables in skeletal Class III subjects.

MATERIALS AND METHODS: Japanese males with skeletal Class III (ANB < 0°) and Class I skeletal anatomy (0.62° < ANB < 5.94°) were recruited (n = 18/group). Acoustic analysis of vowel sounds and cephalometric analysis of lateral cephalograms were performed. For sound analysis, an isolated Japanese vowel (/a/, /i/, /u/, /e/, /o/) pattern was recorded. Praat software was used to extract acoustic parameters such as fundamental frequency (F0) and the first four formants (F1, F2, F3, and F4). The formant graph area was calculated. Cephalometric values were obtained using ImageJ. Correlations between acoustic and cephalometric variables in skeletal Class III subjects were then investigated.
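Measurements like these can be scripted against Praat via the parselmouth interface; the file name is hypothetical and the analysis settings below are Praat defaults, not necessarily those of the study:

    import numpy as np
    import parselmouth

    snd = parselmouth.Sound("vowel_o.wav")  # hypothetical recording

    pitch = snd.to_pitch()
    f0 = pitch.selected_array['frequency']
    f0_mean = float(np.mean(f0[f0 > 0]))    # ignore unvoiced frames

    formants = snd.to_formant_burg(max_number_of_formants=5)
    t_mid = snd.duration / 2
    f1_to_f4 = [formants.get_value_at_time(i, t_mid) for i in range(1, 5)]
    print(f0_mean, f1_to_f4)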

RESULTS: Skeletal Class III subjects exhibited significantly higher /o/ F2 and lower /o/ F4 values. Mandibular length, SNB, and overjet of Class III subjects were moderately negatively correlated with acoustic variables.

LIMITATIONS: This study did not take into account vertical skeletal patterns and tissue movements during sound production.

CONCLUSION: Skeletal Class III males produced different /o/ (back and rounded vowel), possibly owing to their anatomical positions or adaptive changes. Vowel production was moderately associated with cephalometric characteristics of Class III subjects. Thus, changes in speech after orthognathic surgery may be expected. A multidisciplinary team approach that included the input of a speech pathologist would be useful.}, } @article {pmid35728449, year = {2022}, author = {Kabakoff, H and Gritsyk, O and Harel, D and Tiede, M and Preston, JL and Whalen, DH and McAllister, T}, title = {Characterizing sensorimotor profiles in children with residual speech sound disorder: a pilot study.}, journal = {Journal of communication disorders}, volume = {99}, number = {}, pages = {106230}, pmid = {35728449}, issn = {1873-7994}, support = {F31 DC018197/DC/NIDCD NIH HHS/United States ; R01 DC013668/DC/NIDCD NIH HHS/United States ; R01 DC017476/DC/NIDCD NIH HHS/United States ; }, mesh = {Adolescent ; *Apraxias ; Child ; Humans ; *Language Development Disorders ; Pilot Projects ; Speech ; Speech Production Measurement ; *Speech Sound Disorder/therapy ; *Stuttering ; }, abstract = {PURPOSE: Children with speech errors who have reduced motor skill may be more likely to develop residual errors associated with lifelong challenges. Drawing on models of speech production that highlight the role of somatosensory acuity in updating motor plans, this pilot study explored the relationship between motor skill and speech accuracy, and between somatosensory acuity and motor skill in children. Understanding the connections among sensorimotor measures and speech outcomes may offer insight into how somatosensation and motor skill cooperate during speech production, which could inform treatment decisions for this population.

METHOD: Twenty-five children (ages 9-14) produced syllables in an /ɹ/ stimulability task before and after an ultrasound biofeedback treatment program targeting rhotics. We first tested whether motor skill (as measured by two ultrasound-based metrics of tongue shape complexity) predicted acoustically measured accuracy (the normalized difference between the second and third formant frequencies). We then tested whether somatosensory acuity (as measured by an oral stereognosis task) predicted motor skill, while controlling for auditory acuity.
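The acoustic accuracy metric, a normalized distance between the second and third formants, can be sketched as below; the exact normalization used by the authors is not specified in the abstract, so dividing by the sum is an assumption:

    def r_accuracy_index(f2_hz, f3_hz):
        # Normalized F3-F2 distance: smaller values indicate the low-F3
        # constriction typical of a well-formed /ɹ/.
        return (f3_hz - f2_hz) / (f3_hz + f2_hz)

    print(r_accuracy_index(1800.0, 2200.0))  # more /ɹ/-like (low F3)
    print(r_accuracy_index(1700.0, 2900.0))  # less /ɹ/-like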

RESULTS: One measure of tongue shape complexity was a significant predictor of accuracy, such that higher tongue shape complexity was associated with lower accuracy at pre-treatment but higher accuracy at post-treatment. Based on the same measure, children with better somatosensory acuity produced /ɹ/ tongue shapes that were more complex, but this relationship was only present at post-treatment.

CONCLUSION: The predicted relationships among somatosensory acuity, motor skill, and acoustically measured /ɹ/ production accuracy were observed after treatment, but unexpectedly did not hold before treatment. The surprising finding that greater tongue shape complexity was associated with lower accuracy at pre-treatment highlights the importance of evaluating tongue shape patterns (e.g., using ultrasound) prior to treatment, and suggests that children with high tongue shape complexity at pre-treatment may be good candidates for ultrasound-based treatment.}, } @article {pmid35727115, year = {2022}, author = {González-Alvarez, J and Sos-Peña, R}, title = {Perceiving Body Height From Connected Speech: Higher Fundamental Frequency Is Associated With the Speaker's Height.}, journal = {Perceptual and motor skills}, volume = {129}, number = {5}, pages = {1349-1361}, doi = {10.1177/00315125221110392}, pmid = {35727115}, issn = {1558-688X}, mesh = {Body Height ; Body Size ; Female ; Humans ; Male ; *Speech ; *Speech Perception ; }, abstract = {To a certain degree, human listeners can perceive a speaker's body size from their voice. The speaker's voice pitch or fundamental frequency (Fo) and the vocal formant frequencies are the voice parameters that have been most intensively studied in past body size perception research (particularly for body height). Artificially lowering the Fo of isolated vowels from male speakers improved listeners' accuracy of binary (i.e., tall vs not tall) body height perceptions. This has been explained by the theory that a denser harmonic spectrum provided by a low pitch improved the perceptual resolution of formants that aid formant-based size assessments. In the present study, we extended this research using connected speech (i.e., words and sentences) pronounced by speakers of both sexes. Unexpectedly, we found that raising Fo, not lowering it, increased the participants' perceptual performance in two binary discrimination tasks of body size. We explain our new finding in the temporal domain by the dynamic and time-varying acoustic properties of connected speech. Increased Fo might increase the sampling density of sound wave acoustic cycles and provide more detailed information, such as higher resolution, on the envelope shape.}, } @article {pmid35712147, year = {2022}, author = {Sugiyama, Y}, title = {Identification of Minimal Pairs of Japanese Pitch Accent in Noise-Vocoded Speech.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {887761}, pmid = {35712147}, issn = {1664-1078}, abstract = {The perception of lexical pitch accent in Japanese was assessed using noise-excited vocoder speech, which contained no fundamental frequency (fo) or its harmonics. While prosodic information such as lexical stress in English and lexical tone in Mandarin Chinese is known to be encoded in multiple acoustic dimensions, such multidimensionality is less understood for lexical pitch accent in Japanese. In the present study, listeners were tested under four different conditions to investigate the contribution of non-fo properties to the perception of Japanese pitch accent: noise-vocoded speech stimuli consisting of 10 3-ERBN-wide bands and 15 2-ERBN-wide bands created from a male and female speaker. Results found listeners were able to identify minimal pairs of final-accented and unaccented words at a rate better than chance in all conditions, indicating the presence of secondary cues to Japanese pitch accent.
Subsequent analyses were conducted to investigate if the listeners' ability to distinguish minimal pairs was correlated with duration, intensity or formant information. The results found no strong or consistent correlation, suggesting the possibility that listeners used different cues depending on the information available in the stimuli. Furthermore, the comparison of the current results with equivalent studies in English and Mandarin Chinese suggest that, although lexical prosodic information exists in multiple acoustic dimensions in Japanese, the primary cue is more salient than in other languages.}, } @article {pmid35700949, year = {2022}, author = {Preisig, BC and Riecke, L and Hervais-Adelman, A}, title = {Speech sound categorization: The contribution of non-auditory and auditory cortical regions.}, journal = {NeuroImage}, volume = {258}, number = {}, pages = {119375}, doi = {10.1016/j.neuroimage.2022.119375}, pmid = {35700949}, issn = {1095-9572}, mesh = {Acoustic Stimulation/methods ; *Auditory Cortex/diagnostic imaging/physiology ; Auditory Perception ; Hearing ; Humans ; Phonetics ; Speech/physiology ; *Speech Perception/physiology ; }, abstract = {Which processes in the human brain lead to the categorical perception of speech sounds? Investigation of this question is hampered by the fact that categorical speech perception is normally confounded by acoustic differences in the stimulus. By using ambiguous sounds, however, it is possible to dissociate acoustic from perceptual stimulus representations. Twenty-seven normally hearing individuals took part in an fMRI study in which they were presented with an ambiguous syllable (intermediate between /da/ and /ga/) in one ear and with disambiguating acoustic feature (third formant, F3) in the other ear. Multi-voxel pattern searchlight analysis was used to identify brain areas that consistently differentiated between response patterns associated with different syllable reports. By comparing responses to different stimuli with identical syllable reports and identical stimuli with different syllable reports, we disambiguated whether these regions primarily differentiated the acoustics of the stimuli or the syllable report. We found that BOLD activity patterns in left perisylvian regions (STG, SMG), left inferior frontal regions (vMC, IFG, AI), left supplementary motor cortex (SMA/pre-SMA), and right motor and somatosensory regions (M1/S1) represent listeners' syllable report irrespective of stimulus acoustics. Most of these regions are outside of what is traditionally regarded as auditory or phonological processing areas. Our results indicate that the process of speech sound categorization implicates decision-making mechanisms and auditory-motor transformations.}, } @article {pmid35694910, year = {2023}, author = {Sayyahi, F and Boulenger, V}, title = {A temporal-based therapy for children with inconsistent phonological disorder: A case-series.}, journal = {Clinical linguistics & phonetics}, volume = {37}, number = {7}, pages = {655-681}, doi = {10.1080/02699206.2022.2075792}, pmid = {35694910}, issn = {1464-5076}, mesh = {Child, Preschool ; Humans ; Child ; *Speech Sound Disorder/therapy ; Phonetics ; Speech ; Language ; Vocabulary ; }, abstract = {Deficits in temporal auditory processing, and in particular higher gap detection thresholds have been reported in children with inconsistent phonological disorder (IPD). 
Here we hypothesized that providing these children with extra time for phoneme identification may in turn enhance their phonological planning abilities for production, and accordingly improve not only consistency but also accuracy of their speech. We designed and tested a new temporal-based therapy, inspired by Core Vocabulary Therapy and called it T-CVT, where we digitally lengthened formant transitions between phonemes of words used for therapy. This allowed to target both temporal auditory processing and word phonological planning. Four preschool Persian native children with IPD received T-CVT for eight weeks. We measured changes in speech consistency (% inconsistency) and accuracy (percentage of consonants correct PCC) to assess the effects of the intervention. Therapy significantly improved both consistency and accuracy of word production in the four children: % inconsistency decreased from 59% on average before therapy to 2% post-T-CVT, and PCC increased from 61% to 92% on average. Consistency and accuracy were furthermore maintained or even still improved at three-month follow-up (2% inconsistency and 99% PCC). Results in a nonword repetition task showed the generalization of these effects to non-treated material: % inconsistency for nonwords decreased from 67% to 10% post-therapy, and PCC increased from 63% to 90%. These preliminary findings support the efficacy of the T-CVT intervention for children with IPD who show temporal auditory processing deficits as reflected by higher gap detection thresholds.}, } @article {pmid35673798, year = {2022}, author = {Di Dona, G and Scaltritti, M and Sulpizio, S}, title = {Formant-invariant voice and pitch representations are pre-attentively formed from constantly varying speech and non-speech stimuli.}, journal = {The European journal of neuroscience}, volume = {56}, number = {3}, pages = {4086-4106}, pmid = {35673798}, issn = {1460-9568}, mesh = {Acoustic Stimulation/methods ; Attention ; Female ; Humans ; Male ; Reaction Time ; Speech ; *Speech Perception ; }, abstract = {The present study investigated whether listeners can form abstract voice representations while ignoring constantly changing phonological information and if they can use the resulting information to facilitate voice change detection. Further, the study aimed at understanding whether the use of abstraction is restricted to the speech domain or can be deployed also in non-speech contexts. We ran an electroencephalogram (EEG) experiment including one passive and one active oddball task, each featuring a speech and a rotated speech condition. In the speech condition, participants heard constantly changing vowels uttered by a male speaker (standard stimuli) which were infrequently replaced by vowels uttered by a female speaker with higher pitch (deviant stimuli). In the rotated speech condition, participants heard rotated vowels, in which the natural formant structure of speech was disrupted. In the passive task, the mismatch negativity was elicited after the presentation of the deviant voice in both conditions, indicating that listeners could successfully group together different stimuli into a formant-invariant voice representation. In the active task, participants showed shorter reaction times (RTs), higher accuracy and a larger P3b in the speech condition with respect to the rotated speech condition. 
Results showed that whereas at a pre-attentive level the cognitive system can track pitch regularities while presumably ignoring constantly changing formant information both in speech and in rotated speech, at an attentive level the use of such information is facilitated for speech. This facilitation was further evidenced by a stronger synchronisation in the theta band (4-7 Hz), potentially pointing towards differences in encoding/retrieval processes.}, } @article {pmid35667724, year = {2022}, author = {Hampsey, E and Meszaros, M and Skirrow, C and Strawbridge, R and Taylor, RH and Chok, L and Aarsland, D and Al-Chalabi, A and Chaudhuri, R and Weston, J and Fristed, E and Podlewska, A and Awogbemila, O and Young, AH}, title = {Protocol for Rhapsody: a longitudinal observational study examining the feasibility of speech phenotyping for remote assessment of neurodegenerative and psychiatric disorders.}, journal = {BMJ open}, volume = {12}, number = {6}, pages = {e061193}, pmid = {35667724}, issn = {2044-6055}, mesh = {Feasibility Studies ; Humans ; Longitudinal Studies ; *Mental Disorders ; *Mobile Applications ; Observational Studies as Topic ; Speech ; }, abstract = {INTRODUCTION: Neurodegenerative and psychiatric disorders (NPDs) confer a huge health burden, which is set to increase as populations age. New, remotely delivered diagnostic assessments that can detect early-stage NPDs by profiling speech could enable earlier intervention and fewer missed diagnoses. The feasibility of collecting speech data remotely in those with NPDs should be established.

METHODS AND ANALYSIS: The present study will assess the feasibility of obtaining speech data, collected remotely using a smartphone app, from individuals across three NPD cohorts: neurodegenerative cognitive diseases (n=50), other neurodegenerative diseases (n=50) and affective disorders (n=50), in addition to matched controls (n=75). Participants will complete audio-recorded speech tasks and both general and cohort-specific symptom scales. The battery of speech tasks will serve several purposes, such as measuring various elements of executive control (eg, attention and short-term memory), as well as measures of voice quality. Participants will then remotely self-administer speech tasks and follow-up symptom scales over a 4-week period. The primary objective is to assess the feasibility of remote collection of continuous narrative speech across a wide range of NPDs using self-administered speech tasks. Additionally, the study evaluates if acoustic and linguistic patterns can predict diagnostic group, as measured by the sensitivity, specificity, Cohen's kappa and area under the receiver operating characteristic curve of the binary classifiers distinguishing each diagnostic group from each other. Acoustic features analysed include mel-frequency cepstrum coefficients, formant frequencies, intensity and loudness, whereas text-based features such as number of words, noun and pronoun rate and idea density will also be used.
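For the acoustic features listed here, a minimal sketch of MFCC summary-feature extraction with librosa; the file name is hypothetical and the protocol's actual feature pipeline is not specified in the abstract:

    import numpy as np
    import librosa

    # Hypothetical speech-task recording collected by the app.
    y, sr = librosa.load("speech_task.wav", sr=16000)

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)   # (13, n_frames)
    # One fixed-length feature vector per recording: per-coefficient
    # means and standard deviations across frames.
    features = np.concatenate([mfcc.mean(axis=1), mfcc.std(axis=1)])
    print(features.shape)  # (26,)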

ETHICS AND DISSEMINATION: The study received ethical approval from the Health Research Authority and Health and Care Research Wales (REC reference: 21/PR/0070). Results will be disseminated through open access publication in academic journals, relevant conferences and other publicly accessible channels. Results will be made available to participants on request.

TRIAL REGISTRATION NUMBER: NCT04939818.}, } @article {pmid35664509, year = {2022}, author = {Roessig, S and Winter, B and Mücke, D}, title = {Tracing the Phonetic Space of Prosodic Focus Marking.}, journal = {Frontiers in artificial intelligence}, volume = {5}, number = {}, pages = {842546}, pmid = {35664509}, issn = {2624-8212}, abstract = {Focus is known to be expressed by a wide range of phonetic cues but only a few studies have explicitly compared different phonetic variables within the same experiment. Therefore, we presented results from an analysis of 19 phonetic variables conducted on a data set of the German language that comprises the opposition of unaccented (background) vs. accented (in focus), as well as different focus types with the nuclear accent on the same syllable (broad, narrow, and contrastive focus). The phonetic variables are measures of the acoustic and articulographic signals of a target syllable. Overall, our results provide the highest number of reliable effects and largest effect sizes for accentuation (unaccented vs. accented), while the differentiation of focus types with accented target syllables (broad, narrow, and contrastive focus) are more subtle. The most important phonetic variables across all conditions are measures of the fundamental frequency. The articulatory variables and their corresponding acoustic formants reveal lower tongue positions for both vowels /o, a/, and larger lip openings for the vowel /a/ under increased prosodic prominence with the strongest effects for accentuation. While duration exhibits consistent mid-ranked results for both accentuation and the differentiation of focus types, measures related to intensity are particularly important for accentuation. Furthermore, voice quality and spectral tilt are affected by accentuation but also in the differentiation of focus types. Our results confirm that focus is realized via multiple phonetic cues. Additionally, the present analysis allows a comparison of the relative importance of different measures to better understand the phonetic space of focus marking.}, } @article {pmid35664350, year = {2022}, author = {Coughler, C and Quinn de Launay, KL and Purcell, DW and Oram Cardy, J and Beal, DS}, title = {Pediatric Responses to Fundamental and Formant Frequency Altered Auditory Feedback: A Scoping Review.}, journal = {Frontiers in human neuroscience}, volume = {16}, number = {}, pages = {858863}, pmid = {35664350}, issn = {1662-5161}, abstract = {PURPOSE: The ability to hear ourselves speak has been shown to play an important role in the development and maintenance of fluent and coherent speech. Despite this, little is known about the developing speech motor control system throughout childhood, in particular if and how vocal and articulatory control may differ throughout development. A scoping review was undertaken to identify and describe the full range of studies investigating responses to frequency altered auditory feedback in pediatric populations and their contributions to our understanding of the development of auditory feedback control and sensorimotor learning in childhood and adolescence.

METHOD: Relevant studies were identified through a comprehensive search strategy of six academic databases for studies that included (a) real-time perturbation of frequency in auditory input, (b) an analysis of immediate effects on speech, and (c) participants aged 18 years or younger.

RESULTS: Twenty-three articles met inclusion criteria. Across studies, there was a wide variety of designs, outcomes and measures used. Manipulations included fundamental frequency (9 studies), formant frequency (12), frequency centroid of fricatives (1), and both fundamental and formant frequencies (1). Study designs included contrasts across childhood, between children and adults, and between typical, pediatric clinical and adult populations. Measures primarily explored acoustic properties of speech responses (latency, magnitude, and variability). Some studies additionally examined the association of these acoustic responses with clinical measures (e.g., stuttering severity and reading ability), and neural measures using electrophysiology and magnetic resonance imaging.

CONCLUSION: Findings indicated that children above 4 years generally compensated in the opposite direction of the manipulation; however, in several cases not as effectively as adults. Overall, results varied greatly due to the broad range of manipulations and designs used, making generalization challenging. Differences found between age groups in the features of the compensatory vocal responses, latency of responses, vocal variability and perceptual abilities suggest that maturational changes may be occurring in the speech motor control system, affecting the extent to which auditory feedback is used to modify internal sensorimotor representations. Varied findings suggest vocal control develops prior to articulatory control. Future studies with multiple outcome measures, manipulations, and more expansive age ranges are needed to elucidate findings.}, } @article {pmid35634052, year = {2022}, author = {Wang, X and Wang, T}, title = {Voice Recognition and Evaluation of Vocal Music Based on Neural Network.}, journal = {Computational intelligence and neuroscience}, volume = {2022}, number = {}, pages = {3466987}, pmid = {35634052}, issn = {1687-5273}, mesh = {Humans ; *Music ; Neural Networks, Computer ; Voice Quality ; Voice Recognition ; Voice Training ; }, abstract = {Artistic voice is the artistic life of professional voice users. In the process of selecting and cultivating artistic performing talents, the evaluation of the voice occupies a very important position. Therefore, an appropriate evaluation of the artistic voice is crucial. With the development of art education, scientifically evaluating artistic voice training methods and fairly selecting artistic voice talents are urgent needs for the objective evaluation of the artistic voice. The current evaluation methods for artistic voices are time-consuming, laborious, and highly subjective. In the objective evaluation of the artistic voice, the selection of evaluation acoustic parameters is very important. This study attempts to extract the average energy, average frequency error, and average range error of the singing voice using speech analysis technology as objective evaluation acoustic parameters, to evaluate the singing quality of the artistic voice objectively with a neural network method, and to compare the results with the subjective evaluation of senior professional teachers. In this paper, voice analysis technology is used to extract the first formant, third formant, fundamental frequency, sound range, fundamental frequency perturbation, first formant perturbation, third formant perturbation, and average energy of singing as acoustic parameters. By using BP neural network methods, the quality of singing was evaluated objectively and compared with the subjective evaluation of senior vocal professional teachers.
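A BP ("backpropagation") network of the kind described is simply a small multilayer perceptron; a minimal sketch mapping the extracted acoustic parameters to expert ratings, with entirely hypothetical data:

    import numpy as np
    from sklearn.neural_network import MLPRegressor

    rng = np.random.default_rng(0)
    # Hypothetical singing features: F1, F3, F0, range, perturbations,
    # and average energy (8 values per singer).
    X = rng.normal(size=(120, 8))
    teacher_scores = rng.uniform(60, 100, size=120)  # expert ratings

    bp_net = MLPRegressor(hidden_layer_sizes=(16,), max_iter=2000,
                          random_state=0).fit(X, teacher_scores)
    print(bp_net.predict(X[:3]))  # objective scores for new samples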
The results show that the BP neural network method can accurately and objectively evaluate the quality of singing voice by using the evaluation parameters, which is helpful in scientifically guiding the selection and training of artistic voice talents.}, } @article {pmid35612119, year = {2022}, author = {Rafi, S and Gangloff, C and Paulhet, E and Grimault, O and Soulat, L and Bouzillé, G and Cuggia, M}, title = {Out-of-Hospital Cardiac Arrest Detection by Machine Learning Based on the Phonetic Characteristics of the Caller's Voice.}, journal = {Studies in health technology and informatics}, volume = {294}, number = {}, pages = {445-449}, doi = {10.3233/SHTI220498}, pmid = {35612119}, issn = {1879-8365}, mesh = {*Cardiopulmonary Resuscitation ; Emergency Medical Service Communication Systems ; *Emergency Medical Services ; Humans ; Machine Learning ; *Out-of-Hospital Cardiac Arrest/diagnosis ; Phonetics ; }, abstract = {INTRODUCTION: Out-of-hospital cardiac arrest (OHCA) is a major public health issue. The prognosis is closely related to the time from collapse to return of spontaneous circulation. Resuscitation efforts are frequently initiated at the request of emergency call center professionals who are specifically trained to identify critical conditions over the phone. However, 25% of OHCAs are not recognized during the first call. Therefore, it would be interesting to develop automated computer systems to recognize OHCA on the phone. The aim of this study was to build and evaluate machine learning models for OHCA recognition based on the phonetic characteristics of the caller's voice.

METHODS: All patients for whom a call was made to the emergency call center of Rennes, France, between 01/01/2017 and 01/01/2019 were eligible. The predicted variable was OHCA presence. Predictor variables were collected by computer-automated phonetic analysis of the call. They were based on the following voice parameters: fundamental frequency, formants, intensity, jitter, shimmer, harmonic-to-noise ratio, number of voice breaks, and number of periods. Three models were generated using binary logistic regression, random forest, and neural network. The area under the curve (AUC) was the primary outcome used to evaluate each model's performance.
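(A rough illustration of the pipeline this METHODS paragraph describes, under stated assumptions: the sketch below uses parselmouth, the Python bindings for Praat, with common default argument values rather than the authors' settings, and scikit-learn stands in for their unnamed modelling tooling. The variables calls and is_ohca are hypothetical placeholders for the call recordings and OHCA labels.)

import numpy as np
import parselmouth
from parselmouth.praat import call
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score

def voice_features(wav_path):
    snd = parselmouth.Sound(wav_path)
    pitch = snd.to_pitch()
    pulses = call(snd, "To PointProcess (periodic, cc)", 75.0, 500.0)
    return [
        call(pitch, "Get mean", 0, 0, "Hertz"),                      # fundamental frequency
        snd.get_intensity(),                                          # mean intensity (dB)
        call(pulses, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3),  # jitter
        call([snd, pulses], "Get shimmer (local)",
             0, 0, 0.0001, 0.02, 1.3, 1.6),                           # shimmer
        call(snd.to_harmonicity(), "Get mean", 0, 0),                 # harmonic-to-noise ratio
        call(pulses, "Get number of periods", 0, 0, 0.0001, 0.02, 1.3),
    ]

# calls: list of WAV paths; is_ohca: 0/1 labels (both placeholders).
X = np.array([voice_features(p) for p in calls])
y = np.array(is_ohca)
for model in (LogisticRegression(max_iter=1000),
              RandomForestClassifier(n_estimators=500, random_state=0),
              MLPClassifier(max_iter=2000, random_state=0)):
    prob = cross_val_predict(model, X, y, cv=5, method="predict_proba")[:, 1]
    print(type(model).__name__, round(roc_auc_score(y, prob), 3))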

RESULTS: 820 patients were included in the study. The best model to predict OHCA was the random forest (AUC = 74.9, 95% CI = 67.4-82.4).

CONCLUSION: Machine learning models based on the acoustic characteristics of the caller's voice can recognize OHCA. The integration of the acoustic parameters identified in this study will help to design decision-making support systems to improve OHCA detection over the phone.}, } @article {pmid35548492, year = {2022}, author = {Tomaschek, F and Ramscar, M}, title = {Understanding the Phonetic Characteristics of Speech Under Uncertainty-Implications of the Representation of Linguistic Knowledge in Learning and Processing.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {754395}, pmid = {35548492}, issn = {1664-1078}, abstract = {The uncertainty associated with paradigmatic families has been shown to correlate with their phonetic characteristics in speech, suggesting that representations of complex sublexical relations between words are part of speaker knowledge. To better understand this, recent studies have used two-layer neural network models to examine the way paradigmatic uncertainty emerges in learning. However, to date this work has largely ignored the way choices about the representation of inflectional and grammatical functions (IFS) in models strongly influence what they subsequently learn. To explore the consequences of this, we investigate how representations of IFS in the input-output structures of learning models affect the capacity of uncertainty estimates derived from them to account for phonetic variability in speech. Specifically, we examine whether IFS are best represented as outputs to neural networks (as in previous studies) or as inputs by building models that embody both choices and examining their capacity to account for uncertainty effects in the formant trajectories of word final [ɐ], which in German discriminates around sixty different IFS. Overall, we find that formants are enhanced as the uncertainty associated with IFS decreases. This result dovetails with a growing number of studies of morphological and inflectional families that have shown that enhancement is associated with lower uncertainty in context. Importantly, we also find that in models where IFS serve as inputs-as our theoretical analysis suggests they ought to-their uncertainty measures provide better fits to the empirical variance observed in [ɐ] formants than models where IFS serve as outputs. This supports our suggestion that IFS serve as cognitive cues during speech production, and should be treated as such in modeling. It is also consistent with the idea that when IFS serve as inputs to a learning network, the distinction is maintained between those parts of the network that represent message and those that represent signal. We conclude by describing how maintaining a "signal-message-uncertainty distinction" can allow us to reconcile a range of apparently contradictory findings about the relationship between articulation and uncertainty in context.}, } @article {pmid35529579, year = {2022}, author = {Haiduk, F and Fitch, WT}, title = {Understanding Design Features of Music and Language: The Choric/Dialogic Distinction.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {786899}, pmid = {35529579}, issn = {1664-1078}, support = {W 1262/FWF_/Austrian Science Fund FWF/Austria ; }, abstract = {Music and spoken language share certain characteristics: both consist of sequences of acoustic elements that are combinatorically combined, and these elements partition the same continuous acoustic dimensions (frequency, formant space and duration).
However, the resulting categories differ sharply: scale tones and note durations of small integer ratios appear in music, while speech uses phonemes, lexical tone, and non-isochronous durations. Why did music and language diverge into the two systems we have today, differing in these specific features? We propose a framework based on information theory and a reverse-engineering perspective, suggesting that design features of music and language are a response to their differential deployment along three different continuous dimensions. These include the familiar propositional-aesthetic ('goal') and repetitive-novel ('novelty') dimensions, and a dialogic-choric ('interactivity') dimension that is our focus here. Specifically, we hypothesize that music exhibits specializations enhancing coherent production by several individuals concurrently-the 'choric' context. In contrast, language is specialized for exchange in tightly coordinated turn-taking-'dialogic' contexts. We examine the evidence for our framework, both from humans and non-human animals, and conclude that many proposed design features of music and language follow naturally from their use in distinct dialogic and choric communicative contexts. Furthermore, the hybrid nature of intermediate systems like poetry, chant, or solo lament follows from their deployment in the less typical interactive context.}, } @article {pmid35520977, year = {2021}, author = {Hall, A and Kawai, K and Graber, K and Spencer, G and Roussin, C and Weinstock, P and Volk, MS}, title = {Acoustic analysis of surgeons' voices to assess change in the stress response during surgical in situ simulation.}, journal = {BMJ simulation & technology enhanced learning}, volume = {7}, number = {6}, pages = {471-477}, pmid = {35520977}, issn = {2056-6697}, abstract = {INTRODUCTION: Stress may serve as an adjunct (challenge) or hindrance (threat) to the learning process. Determining the effect of an individual's response to situational demands in either a real or simulated situation may enable optimisation of the learning environment. Studies of acoustic analysis suggest that mean fundamental frequency and formant frequencies of voice vary with an individual's response during stressful events. This hypothesis is reviewed within the otolaryngology (ORL) simulation environment to assess whether acoustic analysis could be used as a tool to determine participants' stress response and cognitive load in medical simulation. Such an assessment could lead to optimisation of the learning environment.

METHODOLOGY: ORL simulation scenarios were performed to teach the participants teamwork and refine clinical skills. Each was performed in an actual operating room (OR) environment (in situ) with a multidisciplinary team consisting of ORL surgeons, OR nurses and anaesthesiologists. Ten of the scenarios were led by an ORL attending and ten were led by an ORL fellow. The vocal communication of each of the 20 individual leaders was analysed using long-term pitch analysis in the Praat software (autocorrelation method) to obtain the mean fundamental frequency (F0) and the first four formant frequencies (F1, F2, F3 and F4). In reviewing individual scenarios, each leader's voice was analysed during a non-stressful environment (WHO sign-out procedure) and compared with their voice during a stressful portion of the scenario (responding to deteriorating oxygen saturations in the manikin).
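(A minimal sketch of this comparison, assuming hand-labelled time spans: mean F0 is computed per interval with parselmouth, Praat's Python bindings, and the baseline-to-stress F0 shift is compared between fellow-led and attending-led scenarios. File names and interval times are hypothetical.)

import parselmouth
from parselmouth.praat import call
from scipy.stats import ttest_ind

def mean_f0(wav_path, t_start, t_end):
    # mean F0 (autocorrelation method, as in Praat) over one labelled interval
    pitch = parselmouth.Sound(wav_path).extract_part(t_start, t_end).to_pitch()
    return call(pitch, "Get mean", 0, 0, "Hertz")

def f0_shift(wav_path, baseline_span, stress_span):
    return mean_f0(wav_path, *stress_span) - mean_f0(wav_path, *baseline_span)

# fellows / attendings: placeholder lists of (wav, baseline_span, stress_span)
fellow_shifts = [f0_shift(w, b, s) for w, b, s in fellows]
attending_shifts = [f0_shift(w, b, s) for w, b, s in attendings]
print(ttest_ind(fellow_shifts, attending_shifts))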

RESULTS: The mean unstressed F0 for the male voice was 161.4 Hz and for the female voice was 217.9 Hz. The mean fundamental frequency of speech in the ORL fellow (lead surgeon) group increased by 34.5 Hz between the scenario's baseline and stressful portions. This was significantly different to the mean change of -0.5 Hz noted in the attending group (p=0.01). No changes were seen in F1, F2, F3 or F4.

CONCLUSIONS: This study demonstrates a method of acoustic analysis of the voices of participants taking part in medical simulations. It suggests acoustic analysis of participants may offer a simple, non-invasive, non-intrusive adjunct in evaluating and titrating the stress response during simulation.}, } @article {pmid35497112, year = {2022}, author = {Jarollahi, F and Valadbeigi, A and Jalaei, B and Maarefvand, M and Motasaddi Zarandy, M and Haghani, H and Shirzhiyzn, Z}, title = {Comparing Sound-Field Speech-Auditory Brainstem Response Components between Cochlear Implant Users with Different Speech Recognition in Noise Scores.}, journal = {Iranian journal of child neurology}, volume = {16}, number = {2}, pages = {93-105}, pmid = {35497112}, issn = {1735-4668}, abstract = {OBJECTIVES: Many studies have suggested that cochlear implant (CI) users vary in terms of speech recognition in noise. Studies in this field attribute this variety partly to subcortical auditory processing. Studying speech-Auditory Brainstem Response (speech-ABR) provides good information about speech processing; thus, this work was designed to compare speech-ABR components between two groups of CI users with good and poor speech recognition in noise scores.

MATERIALS & METHODS: The present study was conducted on two groups of CI users aged 8-10 years old. The first group (CI-good) consisted of 15 children with prelingual CI who had good speech recognition in noise performance. The second group (CI-poor) was matched with the first group, but they had poor speech recognition in noise performance. The speech-ABR test in a sound-field presentation was performed for all the participants.

RESULTS: The speech-ABR response showed longer C, D, E, F, and O latencies in CI-poor than in CI-good users (P < 0.05), whereas no significant difference was observed in the initial waves V (t = -0.293, P = 0.771) and A (t = -1.051, P = 0.307). Analysis in the spectral domain showed a weaker representation of the fundamental frequency, as well as of the first formant and the high-frequency component of the speech stimuli, in the CI users with poor auditory performance.
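(A sketch of the spectral-domain analysis mentioned above: the averaged speech-ABR waveform is Fourier-transformed and mean amplitude is read out in bands around the fundamental frequency, the first formant, and the high-frequency component. Band edges here are illustrative, not the authors' values.)

import numpy as np

def band_amplitude(response, fs, f_lo, f_hi):
    # mean FFT magnitude of the averaged evoked response within [f_lo, f_hi] Hz
    spectrum = np.abs(np.fft.rfft(response)) / len(response)
    freqs = np.fft.rfftfreq(len(response), d=1.0 / fs)
    band = (freqs >= f_lo) & (freqs <= f_hi)
    return spectrum[band].mean()

# response: averaged waveform (numpy array); fs: sampling rate in Hz
f0_amp = band_amplitude(response, fs, 100.0, 120.0)   # fundamental region
f1_amp = band_amplitude(response, fs, 400.0, 720.0)   # first-formant region
hf_amp = band_amplitude(response, fs, 720.0, 1200.0)  # high-frequency region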

CONCLUSIONS: Results revealed that CI users who showed poor auditory performance in noise had deficits in encoding the periodic portion of speech signals at the brainstem level. This study could also serve as physiological evidence for poorer pitch processing in CI users with poor speech recognition in noise.}, } @article {pmid35452247, year = {2022}, author = {Houle, N and Goudelias, D and Lerario, MP and Levi, SV}, title = {Effect of Anchor Term on Auditory-Perceptual Ratings of Feminine and Masculine Speakers.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {6}, pages = {2064-2080}, pmid = {35452247}, issn = {1558-9102}, support = {T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Auditory Perception ; Cues ; Female ; Humans ; Male ; Speech ; *Speech Acoustics ; *Speech Perception ; }, abstract = {BACKGROUND: Studies investigating auditory perception of gender expression vary greatly in the specific terms applied to gender expression in rating scales.

PURPOSE: This study examined the effects of different anchor terms on listeners' auditory perceptions of gender expression in phonated and whispered speech. Additionally, token and speaker cues were examined to identify predictors of the auditory-perceptual ratings.

METHOD: Inexperienced listeners (n = 105) completed an online rating study in which they were asked to use one of five visual analog scales (VASs) to rate cis men, cis women, and transfeminine speakers in both phonated and whispered speech. The VASs varied by anchor term (very female/very male, feminine/masculine, feminine female/masculine male, very feminine/not at all feminine, and not at all masculine/very masculine).

RESULTS: Linear mixed-effects models revealed significant two-way interactions of gender expression by anchor term and gender expression by condition. In general, the feminine female/masculine male scale resulted in the most extreme ratings (closest to the end points), and the feminine/masculine scale resulted in the most central ratings. As expected, for all speakers, whispered speech was rated more centrally than phonated speech. Additionally, ratings of phonated speech were predicted by mean fundamental frequency (fo) within each speaker group and by smoothed cepstral peak prominence in cisgender speakers. In contrast, ratings of whispered speech, which lacks an fo, were predicted by indicators of vocal tract resonance (second formant and speaker height).
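(A minimal statsmodels sketch of the mixed-effects structure reported here: fixed effects with the two two-way interactions and a random intercept per listener. The long-format table ratings and its column names are hypothetical.)

import statsmodels.formula.api as smf

model = smf.mixedlm(
    "rating ~ gender_expression * anchor_term + gender_expression * condition",
    data=ratings,                # one row per rated token
    groups=ratings["listener"],  # random intercept for each listener
)
print(model.fit().summary())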

CONCLUSIONS: The current results indicate that differences in the terms applied to rating scales limit generalization of results across studies. Identifying the patterns across listener ratings of gender expression provides a rationale for researchers and clinicians when making choices about terms. Additionally, beyond fo and vocal tract resonance, predictors of listener ratings vary based on the anchor terms used to describe gender expression.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.19617564.}, } @article {pmid35418360, year = {2022}, author = {Kırbac, A and Turkyılmaz, MD and Yağcıoglu, S}, title = {Gender Effects on Binaural Speech Auditory Brainstem Response.}, journal = {The journal of international advanced otology}, volume = {18}, number = {2}, pages = {125-130}, pmid = {35418360}, issn = {2148-3817}, mesh = {Acoustic Stimulation ; Adult ; Brain Stem/physiology ; *Evoked Potentials, Auditory, Brain Stem/physiology ; Female ; Humans ; Male ; Speech/physiology ; *Speech Perception/physiology ; }, abstract = {BACKGROUND: The speech auditory brainstem response is a tool that provides direct information on how speech sound is temporally and spectrally coded by the auditory brainstem. Speech auditory brainstem response is influenced by many variables, but the effect of gender is unclear, particularly in the binaural recording. Studies on speech auditory brainstem response evoked by binaural stimulation are limited, but gender studies are even more limited and contradictory. This study aimed at examining the effect of gender on speech auditory brainstem response in adults.

METHODS: Time- and frequency-domain analyses of speech auditory brainstem response recordings of 30 healthy participants (15 women and 15 men) aged 18-35 years with normal hearing and no musical education were obtained. For each adult, speech auditory brainstem response was recorded with the syllable /da/ presented binaurally. Peaks of time (V, A, C, D, E, F, and O) and frequency (fundamental frequency, first formant frequency, and high frequency) domains of speech auditory brainstem response were compared between men and women.
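(The time-domain comparison described here reduces to a per-wave test of peak latencies between women and men; an independent-samples t test is used below purely for illustration, with latencies as a hypothetical mapping from group to per-wave latency arrays.)

from scipy.stats import ttest_ind

for wave in ["V", "A", "C", "D", "E", "F", "O"]:
    t, p = ttest_ind(latencies["women"][wave], latencies["men"][wave])
    print(f"wave {wave}: t = {t:.2f}, p = {p:.3f}")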

RESULTS: V, A, and F peak latencies of women were significantly shorter than those of men (P < .05). However, no difference was found in the peak amplitude of the time (P > .05) or frequency domain between women and men (P > .05).

CONCLUSION: Gender differences in binaural speech auditory brainstem response are significant in adults, particularly in the time domain. When speech stimuli are used for auditory brainstem responses, normative data specific to gender are required. Preliminary normative data from this study could serve as a reference for future studies on binaural speech auditory brainstem response among Turkish adults.}, } @article {pmid35416268, year = {2022}, author = {Yasar, OC and Ozturk, S and Kemal, O and Kocabicak, E}, title = {Effects of Subthalamic Nucleus Deep Brain Stimulation Surgery on Voice and Formant Frequencies of Vowels in Turkish.}, journal = {Turkish neurosurgery}, volume = {32}, number = {5}, pages = {764-772}, doi = {10.5137/1019-5149.JTN.36134-21.2}, pmid = {35416268}, issn = {2651-5032}, mesh = {*Deep Brain Stimulation/methods ; Humans ; Language ; *Parkinson Disease/surgery ; *Subthalamic Nucleus/physiology/surgery ; }, abstract = {AIM: To investigate the effects of deep brain stimulation (DBS) of the subthalamic nucleus (STN) on acoustic characteristics of voice production in Turkish patients with Parkinson's disease (PD).

MATERIAL AND METHODS: This study recruited 20 patients diagnosed with PD. Voice samples were recorded under the "stimulation on" and "stimulation off" conditions of STN-DBS. Acoustic recordings of the patients were made during the production of vowels /a/, /o/, and /i/ and repetition of the syllables /pa/-/ta/-/ka/. Acoustic analyses were performed using Praat.

RESULTS: A significant difference in the parameters was observed among groups for vowels. A significant positive difference was observed between the preoperative med-on and postoperative med-on/stim-on groups for /a/ and between the postoperative med-on/stim-on and postoperative med-on/stim-off groups for /o/ and /i/ for frequency perturbation (jitter) and noise-to-harmonics ratio. No significant difference was noted between the preoperative med-on and postoperative med-on/stim-off groups for any vowels.

CONCLUSION: STN-DBS surgery has an acute positive effect on voice. Studies on formant frequency analysis in STN-DBS may be expanded with both articulation and intelligibility tests to enable us to combine patient abilities in various perspectives and to obtain precise results.}, } @article {pmid35400757, year = {2022}, author = {Whalen, DH and DiCanio, C and Dockum, R}, title = {Phonetic Documentation in Three Collections: Topics and Evolution.}, journal = {Journal of the International Phonetic Association}, volume = {52}, number = {1}, pages = {95-121}, pmid = {35400757}, issn = {0025-1003}, support = {R01 DC002717/DC/NIDCD NIH HHS/United States ; }, abstract = {Phonetic aspects of many languages have been documented, though the breadth and focus of such documentation varies substantially. In this survey, phonetic aspects (here called "categories") that are typically reported were assessed in three English-language collections-the Illustrations of the IPA, articles from the Journal of Phonetics, and papers from the Ladefoged/Maddieson Sounds of the World's Languages (SOWL) documentation project. Categories were defined for consonants (e.g., Voice Onset Time (VOT) and frication spectrum; 10 in total), vowels (e.g., formants and duration; 7 total) and suprasegmentals (e.g., stress and distinctive vowel length, 6 total). The Illustrations, due to their brevity, had, on average, limited coverage of the selected categories (12% of the 23 categories). Journal of Phonetics articles were typically theoretically motivated, but 64 had sufficient measurements to count as phonetic documentation; these also covered 12% of the categories. The SOWL studies, designed to cover as much of the phonetic structure as feasible in an article-length treatment, achieved 41% coverage on average. Four book-length studies were also examined, with an average of 49% coverage. Phonetic properties of many language families have been studied, though Indo-European is still disproportionately represented. Physiological measures were excluded as being less common, and perceptual measures were excluded as being typically more theoretical. This preliminary study indicates that certain acoustic properties of languages are typically measured and may be considered as an impetus for later, fuller coverage, but broader consensus on the categories is needed. Current documentation efforts could be more useful if these considerations were addressed.}, } @article {pmid35394801, year = {2022}, author = {Dahl, KL and François, FA and Buckley, DP and Stepp, CE}, title = {Voice and Speech Changes in Transmasculine Individuals Following Circumlaryngeal Massage and Laryngeal Reposturing.}, journal = {American journal of speech-language pathology}, volume = {31}, number = {3}, pages = {1368-1382}, pmid = {35394801}, issn = {1558-9110}, support = {R01 DC015570/DC/NIDCD NIH HHS/United States ; R01 DC020061/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Male ; Massage ; Speech ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {PURPOSE: The purpose of this study was to measure the short-term effects of circumlaryngeal massage and laryngeal reposturing on acoustic and perceptual characteristics of voice in transmasculine individuals.

METHOD: Fifteen transmasculine individuals underwent one session of sequential circumlaryngeal massage and laryngeal reposturing with a speech-language pathologist. Voice recordings were collected at three time points-baseline, postmassage, and postreposturing. Fundamental frequency (fo), formant frequencies, and relative fundamental frequency (RFF; an acoustic correlate of laryngeal tension) were measured. Estimates of vocal tract length (VTL) were derived from formant frequencies. Twelve listeners rated the perceived masculinity of participants' voices at each time point. Repeated-measures analyses of variance measured the effect of time point on fo, estimated VTL, RFF, and perceived voice masculinity. Significant effects were evaluated with post hoc Tukey's tests.
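(The abstract does not give the authors' exact VTL estimator; one common choice, sketched below, treats the vocal tract as a uniform closed-open tube, for which the i-th formant of a tract of length L is Fi = (2i - 1)c / 4L, and averages the per-formant length estimates.)

SPEED_OF_SOUND_CM_S = 35000.0

def estimate_vtl_cm(formants_hz):
    # quarter-wave resonator: L = (2i - 1) * c / (4 * Fi), averaged over formants
    lengths = [(2 * i - 1) * SPEED_OF_SOUND_CM_S / (4.0 * f)
               for i, f in enumerate(formants_hz, start=1)]
    return sum(lengths) / len(lengths)

# F1-F4 of an idealized neutral vowel (illustrative values only)
print(round(estimate_vtl_cm([500.0, 1500.0, 2500.0, 3500.0]), 1))  # 17.5 cm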

RESULTS: Between baseline and the end of the session, fo decreased, VTL increased, and participants' voices were perceived as more masculine, all with statistically significant differences. RFF did not differ significantly at any time point. Outcomes were highly variable at the individual level.

CONCLUSION: Circumlaryngeal massage and laryngeal reposturing have short-term effects on select acoustic (fo, estimated VTL) and perceptual characteristics (listener-assigned voice masculinity) of voice in transmasculine individuals.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.19529299.}, } @article {pmid35377739, year = {2022}, author = {Swann, Z and Daliri, A and Honeycutt, CF}, title = {Impact of Startling Acoustic Stimuli on Word Repetition in Individuals With Aphasia and Apraxia of Speech Following Stroke.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {5}, pages = {1671-1685}, doi = {10.1044/2022_JSLHR-21-00486}, pmid = {35377739}, issn = {1558-9102}, mesh = {Acoustics ; *Aphasia/etiology ; *Apraxias/etiology ; Humans ; Reflex, Startle/physiology ; Speech Intelligibility ; *Stroke/complications ; }, abstract = {PURPOSE: The StartReact effect, whereby movements are elicited by loud, startling acoustic stimuli (SAS), allows the evaluation of movements when initiated through involuntary circuitry, before auditory feedback. When StartReact is applied during poststroke upper extremity movements, individuals exhibit increased muscle recruitment, reaction times, and reaching distances. StartReact releases unimpaired speech with similar increases in muscle recruitment and reaction time. However, as poststroke communication disorders have divergent neural circuitry from upper extremity tasks, it is unclear if StartReact will enhance speech poststroke. Our objective is to determine if (a) StartReact is present in individuals with poststroke aphasia and apraxia and (b) SAS exposure enhances speech intelligibility.

METHOD: We remotely delivered startling, 105-dB white noise bursts (SAS) and quiet, non-SAS cues to 15 individuals with poststroke aphasia and apraxia during repetition of six words. We evaluated average word intensity, pitch, pitch trajectories, vowel formants F1 and F2 (first and second formants), phonemic error rate, and percent incidence of each SAS versus non-SAS-elicited phoneme produced under each cue type.

RESULTS: For SAS trials compared to non-SAS, speech intensity increased (∆ + 0.6 dB), speech pitch increased (∆ + 22.7 Hz), and formants (F1 and F2) changed, resulting in a smaller vowel space after SAS. SAS affected pitch trajectories for some, but not all, words. Non-SAS trials had more stops (∆ + 4.7 utterances) while SAS trials had more sustained phonemes (fricatives, glides, affricates, liquids; ∆ + 5.4 utterances). SAS trials had fewer distortion errors but no change in substitution errors or overall error rate compared to non-SAS trials.
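(One way to quantify the "smaller vowel space" finding: vowel space area as the convex hull of (F1, F2) points over a speaker's vowel tokens, computed separately per cue condition. The formant arrays are hypothetical, and this is a generic metric, not necessarily the authors' computation.)

import numpy as np
from scipy.spatial import ConvexHull

def vowel_space_area(f1_hz, f2_hz):
    points = np.column_stack([f1_hz, f2_hz])
    return ConvexHull(points).volume  # in 2-D, .volume is the hull's area

area_sas = vowel_space_area(f1_sas, f2_sas)  # SAS-elicited tokens
area_non = vowel_space_area(f1_non, f2_non)  # non-SAS tokens
print(f"vowel space ratio (SAS / non-SAS): {area_sas / area_non:.2f}")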

CONCLUSIONS: We show that stroke-impaired speech is susceptible to StartReact, evidenced by decreased intelligibility due to altered formants, pitch trajectories, and articulation, including increased incidence of sounds that could not be produced without SAS. Future studies should examine the impact of SAS on voluntary speech intelligibility and clinical measures of aphasia and apraxia.}, } @article {pmid35377182, year = {2022}, author = {Zhang, G and Shao, J and Zhang, C and Wang, L}, title = {The Perception of Lexical Tone and Intonation in Whispered Speech by Mandarin-Speaking Congenital Amusics.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {4}, pages = {1331-1348}, doi = {10.1044/2021_JSLHR-21-00345}, pmid = {35377182}, issn = {1558-9102}, mesh = {*Auditory Perceptual Disorders ; Humans ; Pitch Perception ; Recognition, Psychology ; Speech ; *Speech Perception ; }, abstract = {PURPOSE: A fundamental feature of human speech is variation, including the manner of phonation, as exemplified in the case of whispered speech. In this study, we employed whispered speech to examine an unresolved issue about congenital amusia, a neurodevelopmental disorder of musical pitch processing, which also affects speech pitch processing such as lexical tone and intonation perception. The controversy concerns whether amusia is a pitch-processing disorder or can affect speech processing beyond pitch.

METHOD: We examined lexical tone and intonation recognition in 19 Mandarin-speaking amusics and 19 matched controls in phonated and whispered speech, where fundamental frequency (fo) information is either present or absent.

RESULTS: The results revealed that the performance of congenital amusics was inferior to that of controls in lexical tone identification in both phonated and whispered speech. These impairments were also detected in identifying intonation (statements/questions) in phonated and whispered modes. Across the experiments, regression models revealed that fo and non-fo (duration, intensity, and formant frequency) acoustic cues predicted tone and intonation recognition in phonated speech, whereas non-fo cues predicted tone and intonation recognition in whispered speech. There were significant differences between amusics and controls in the use of both fo and non-fo cues.

CONCLUSION: The results provided the first evidence that the impairments of amusics in lexical tone and intonation identification persist in whispered speech and support the hypothesis that the deficits of amusia extend beyond pitch processing.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.19302275.}, } @article {pmid35363414, year = {2022}, author = {Carl, M and Levy, ES and Icht, M}, title = {Speech treatment for Hebrew-speaking adolescents and young adults with developmental dysarthria: A comparison of mSIT and Beatalk.}, journal = {International journal of language & communication disorders}, volume = {57}, number = {3}, pages = {660-679}, doi = {10.1111/1460-6984.12715}, pmid = {35363414}, issn = {1460-6984}, mesh = {Acoustics ; Adolescent ; *Dysarthria/etiology/therapy ; Humans ; Language ; Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; Young Adult ; }, abstract = {BACKGROUND: Individuals with developmental dysarthria typically demonstrate reduced functioning of one or more of the speech subsystems, which negatively impacts speech intelligibility and communication within social contexts. A few treatment approaches are available for improving speech production and intelligibility among individuals with developmental dysarthria. However, these approaches have only limited application and research findings among adolescents and young adults.

AIMS: To determine and compare the effectiveness of two treatment approaches, the modified Speech Intelligibility Treatment (mSIT) and the Beatalk technique, on speech production and intelligibility among Hebrew-speaking adolescents and young adults with developmental dysarthria.

METHODS & PROCEDURES: Two matched groups of adolescents and young adults with developmental dysarthria participated in the study. Each received one of the two treatments, mSIT or Beatalk, over the course of 9 weeks. Measures of speech intelligibility, articulatory accuracy, voice and vowel acoustics were assessed both pre- and post-treatment.

OUTCOMES & RESULTS: Both the mSIT and Beatalk groups demonstrated gains in at least some of the outcome measures. Participants in the mSIT group exhibited improvement in speech intelligibility and voice measures, while participants in the Beatalk group demonstrated increased articulatory accuracy and gains in voice measures from pre- to post-treatment. Significant increases were noted post-treatment for first formant values for select vowels.

CONCLUSIONS & IMPLICATIONS: Results of this preliminary study are promising for both treatment approaches. The differentiated results indicate their distinct application to speech intelligibility deficits. The current findings also hold clinical significance for treatment among adolescents and young adults with motor speech disorders and application for a language other than English.

WHAT THIS PAPER ADDS: What is already known on the subject: Developmental dysarthria (e.g., secondary to cerebral palsy) is a motor speech disorder that negatively impacts speech intelligibility, and thus communication participation. Select treatment approaches are available with the aim of improving speech intelligibility in individuals with developmental dysarthria; however, these approaches are limited in number and have only seldom been applied specifically to adolescents and young adults. What this paper adds to existing knowledge: The current study presents preliminary data regarding two treatment approaches, the mSIT and Beatalk technique, administered to Hebrew-speaking adolescents and young adults with developmental dysarthria in a group setting. Results demonstrate the initial effectiveness of the treatment approaches, with different gains noted for each approach across speech and voice domains. What are the potential or actual clinical implications of this work? The findings add to the existing literature on potential treatment approaches aiming to improve speech production and intelligibility among individuals with developmental dysarthria. The presented approaches also show promise for group-based treatments as well as the potential for improvement among adolescents and young adults with motor speech disorders.}, } @article {pmid35344948, year = {2022}, author = {Ho, GY and Kansy, IK and Klavacs, KA and Leonhard, M and Schneider-Stickler, B}, title = {Effect of FFP2/3 Masks on Voice Range Profile Measurement and Voice Acoustics in Routine Voice Diagnostics.}, journal = {Folia phoniatrica et logopaedica : official organ of the International Association of Logopedics and Phoniatrics (IALP)}, volume = {74}, number = {5}, pages = {335-344}, doi = {10.1159/000524299}, pmid = {35344948}, issn = {1421-9972}, mesh = {*Acoustics ; Adult ; COVID-19 ; COVID-19 Testing ; Female ; Humans ; Male ; *Masks ; Middle Aged ; Pandemics ; Phonation ; Speech Acoustics ; *Voice ; Young Adult ; }, abstract = {INTRODUCTION: Voice diagnostics including voice range profile (VRP) measurement and acoustic voice analysis is essential in laryngology and phoniatrics. Due to the COVID-19 pandemic, wearing of filtering face piece class 2 or 3 (FFP2/3) masks is recommended when high-risk aerosol-generating procedures like singing and speaking are being performed. The goal of this study was to compare VRP parameters measured without and with FFP2/3 masks. Further, formant analysis for sustained vowels, analysis of the singer's formant, and analysis of readings of a standard text were performed without/with FFP2/3 masks.

METHODS: Twenty subjects (6 males and 14 females) were enrolled in this study, with an average age of 36 ± 16 years (mean ± SD). Fourteen subjects were rated as euphonic/not hoarse and 6 as mildly hoarse. All subjects underwent VRP measurements and vowel and text recordings without/with FFP2/3 mask using the software DiVAS by XION medical (Berlin, Germany). Voice range of the singing voice, equivalent of voice extension measure (eVEM), fundamental frequency (F0), and sound pressure level (SPL) of soft speaking and shouting were calculated and analyzed. Maximum phonation time (MPT) and jitter-% were included for Dysphonia Severity Index (DSI) measurement. Analyses of the singer's formant were performed. Spectral analyses of sustained vowels /a:/, /i:/, and /u:/ (first = F1 and second = F2 formants), intensity of the long-term average spectrum, and alpha-ratio were calculated using the freeware Praat.
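(The DSI mentioned here is conventionally computed from the four measures named above; the standard weighted formula of Wuyts et al. (2000) is transcribed below, with illustrative input values.)

def dysphonia_severity_index(mpt_s, f0_high_hz, spl_low_db, jitter_percent):
    # DSI = 0.13*MPT + 0.0053*F0high - 0.26*SPLlow - 1.18*jitter(%) + 12.4
    return (0.13 * mpt_s + 0.0053 * f0_high_hz
            - 0.26 * spl_low_db - 1.18 * jitter_percent + 12.4)

# e.g. MPT 25 s, highest F0 880 Hz, softest SPL 55 dB, jitter 0.5 %
print(round(dysphonia_severity_index(25.0, 880.0, 55.0, 0.5), 1))  # 5.4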

RESULTS: For all subjects, the mean values of routine voice parameters without/with mask were analyzed: no significant differences were found in singing voice range, eVEM, SPL, or frequency of soft speaking/shouting, except for a significantly lower mean SPL of shouting with the FFP2/3 mask, in particular among the female subjects (p = 0.002). Results for MPT, jitter, and DSI without/with FFP2/3 mask showed no significant differences. Further mean values analyzed without/with mask were the ratio of the singer's formant to loud singing, which was lower with the FFP2/3 mask (p = 0.001), and F1 and F2 of /a:/, /i:/, and /u:/, which showed no significant differences except for a lower F2 of /i:/ with the FFP2/3 mask (p = 0.005). With the exceptions mentioned, the t test revealed no significant differences for any of the routine parameters tested in the recordings without and with a FFP2/3 mask.

CONCLUSION: It can be concluded that VRP measurements including DSI performed with FFP2/3 masks provide reliable data in clinical routine with respect to voice condition/constitution. Spectral analyses of sustained vowel, text, and singer's formant will be affected by wearing FFP2/3 masks.}, } @article {pmid35344807, year = {2022}, author = {Chauvette, L and Fournier, P and Sharp, A}, title = {The frequency-following response to assess the neural representation of spectral speech cues in older adults.}, journal = {Hearing research}, volume = {418}, number = {}, pages = {108486}, doi = {10.1016/j.heares.2022.108486}, pmid = {35344807}, issn = {1878-5891}, mesh = {Acoustic Stimulation/methods ; Aged ; Cues ; *Hearing Loss ; Humans ; Speech ; *Speech Perception/physiology ; }, abstract = {Older adults often present difficulties understanding speech that cannot be explained by age-related changes in sound audibility. Psychoacoustic and electrophysiologic studies have linked these suprathreshold difficulties to age-related deficits in the auditory processing of temporal and spectral sound information. These studies suggest the existence of an age-related temporal processing deficit in the central auditory system, but the existence of such deficit in the spectral domain remains understudied. The FFR is an electrophysiological evoked response that assesses the ability of the neural auditory system to reproduce the spectral and temporal patterns of a sound. The main goal of this short review is to investigate if the FFR can identify and measure spectral processing deficits in the elderly compared to younger adults (for both, without hearing loss or competing noise). Furthermore, we want to determine what stimuli and analyses have been used in the literature to assess the neural encoding of spectral cues in older adults. Almost all reviewed articles showed an age-related decline in the auditory processing of spectral acoustic information. Even when using different speech and non-speech stimuli, studies reported an age-related decline at the fundamental frequency, at the first formant, and at other harmonic components using different metrics, such as the response's amplitude, inter-trial phase coherence, signal-to-response correlation, and signal-to-noise ratio. These results suggest that older adults may present age-related spectral processing difficulties, but further FFR studies are needed to clarify the effect of advancing age on the neural encoding of spectral speech cues. Spectral processing research on aging would benefit from using a broader variety of stimuli and from rigorously controlling for hearing thresholds even in the absence of disabling hearing loss. Advances in the understanding of the effect of age on FFR measures of spectral encoding could lead to the development of new clinical tools, with possible applications in the field of hearing aid fitting.}, } @article {pmid35310278, year = {2022}, author = {Zaltz, Y and Kishon-Rabin, L}, title = {Difficulties Experienced by Older Listeners in Utilizing Voice Cues for Speaker Discrimination.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {797422}, pmid = {35310278}, issn = {1664-1078}, abstract = {Human listeners are assumed to apply different strategies to improve speech recognition in background noise. 
Young listeners with normal hearing (NH), for example, have been shown to follow the voice of a particular speaker based on the fundamental (F0) and formant frequencies, which are both influenced by the gender, age, and size of the speaker. However, the auditory and cognitive processes that underlie the extraction and discrimination of these voice cues across speakers may be subject to age-related decline. The present study aimed to examine the utilization of F0 and formant cues for voice discrimination (VD) in older adults with hearing expected for their age. Difference limens (DLs) for VD were estimated in 15 healthy older adults (65-78 years old) and 35 young adults (18-35 years old) using only F0 cues, only formant frequency cues, and a combination of F0 + formant frequencies. A three-alternative forced-choice paradigm with an adaptive-tracking threshold-seeking procedure was used. The Wechsler backward digit span test was used as a measure of auditory working memory. The Trail Making Test (TMT) was used to provide cognitive information reflecting a combined effect of processing speed, mental flexibility, and executive control abilities. The results showed that (a) the mean VD thresholds of the older adults were poorer than those of the young adults for all voice cues, although larger variability was observed among the older listeners; (b) both age groups found the formant cues more beneficial for VD, compared to the F0 cues, and the combined (F0 + formant) cues resulted in better thresholds, compared to each cue separately; (c) significant associations were found for the older adults in the combined F0 + formant condition between VD and TMT scores, and between VD and hearing sensitivity, supporting the notion that a decline with age in both top-down and bottom-up mechanisms may hamper the ability of older adults to discriminate between voices. The present findings suggest that older listeners may have difficulty following the voice of a specific speaker and thus in using this as a strategy for listening amid noise. This may contribute to understanding their reported difficulty listening in adverse conditions.}, } @article {pmid35288014, year = {2024}, author = {Paulino, CEB and Silva, HJD and Gomes, AOC and Silva, JMSD and Cunha, DAD and Coriolano, MDGWS and Lopes, LW and Lira, ZS}, title = {Relationship Between Oropharyngeal Geometry and Vocal Parameters in Subjects With Parkinson's Disease.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {4}, pages = {967.e9-967.e17}, doi = {10.1016/j.jvoice.2022.01.020}, pmid = {35288014}, issn = {1873-4588}, mesh = {Humans ; *Parkinson Disease/physiopathology/diagnosis ; Male ; Female ; Middle Aged ; Cross-Sectional Studies ; *Speech Acoustics ; *Voice Quality ; Aged ; Retrospective Studies ; *Oropharynx/physiopathology ; *Speech Production Measurement ; *Acoustics ; Phonation ; Voice Disorders/physiopathology/diagnosis/etiology ; }, abstract = {OBJECTIVE: To verify whether the dimensions of different segments of the oropharyngeal cavity have different proportions between Parkinson's disease patients and vocally healthy subjects and investigate whether the measurements of these subjects' oropharyngeal geometry associate with their acoustic measurements of voice.

METHOD: Quantitative, descriptive, cross-sectional, and retrospective study with secondary data, approved by the Human Research Ethics Committee under no. 4.325.029. We used vocal samples and data from the oropharyngeal geometry of 40 subjects - 20 with Parkinson's disease stages I to III and 20 who formed the control group, matched for sex and age. Each group had 10 males and 10 females, mean age of 61 years (±6.0). Formant (F1, F2, and F3) and cepstral measures of the sustained vowel /ε/ were extracted with Praat software and entered into the database. The data were descriptively analyzed, with statistics generated with R software. The proportion of oropharyngeal geometry measurements was arranged by mean values and coefficients of variation. Pearson's linear correlation test was applied to relate voice parameters to oropharyngeal geometry, considering P < 0.05, and a linear regression test was applied to explain F2.
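(A sketch of the named statistics, assuming a per-subject data frame df with hypothetical column names: Pearson correlations relate geometry measures to formants, and an ordinary least squares model stands in for the regression used to explain F2.)

from scipy.stats import pearsonr
import statsmodels.formula.api as smf

for measure in ["oral_cavity_length", "glottal_area", "pharynx_length"]:
    r, p = pearsonr(df[measure], df["F2"])
    print(f"{measure} vs F2: r = {r:.2f}, p = {p:.3f}")

fit = smf.ols("F2 ~ glottal_area + age", data=df).fit()
print(fit.rsquared)  # compare with the reported R^2 of 0.61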

RESULTS: The Parkinson's disease group showed a linear relationship between oral cavity length and F1 in males (P = 0.04) and between glottal area and F2 in females (P = 0.00); linear relationships were established according to age in both groups, and a regression model for F2 was estimated (R² = 0.61). There was no difference between pathological and healthy voices; there was a difference in the proportional relationship of oropharyngeal geometry between the groups.

CONCLUSION: The proportional relationship of oropharyngeal geometry differs between the Parkinson's disease group and the control group, as well as the relationship between oropharyngeal geometry and formant and cepstral values of voice according to the subjects' sex and age.}, } @article {pmid35276418, year = {2022}, author = {Jüchter, C and Beutelmann, R and Klump, GM}, title = {Speech sound discrimination by Mongolian gerbils.}, journal = {Hearing research}, volume = {418}, number = {}, pages = {108472}, doi = {10.1016/j.heares.2022.108472}, pmid = {35276418}, issn = {1878-5891}, mesh = {Animals ; Auditory Perception/physiology ; Gerbillinae ; Humans ; *Phonetics ; Speech/physiology ; *Speech Perception/physiology ; }, abstract = {The present study establishes the Mongolian gerbil (Meriones unguiculatus) as a model for investigating the perception of human speech sounds. We report data on the discrimination of logatomes (CVCs - consonant-vowel-consonant combinations with outer consonants /b/, /d/, /s/ and /t/ and central vowels /a/, /aː/, /ɛ/, /eː/, /ɪ/, /iː/, /ɔ/, /oː/, /ʊ/ and /uː/, VCVs - vowel-consonant-vowel combinations with outer vowels /a/, /ɪ/ and /ʊ/ and central consonants /b/, /d/, /f/, /g/, /k/, /l/, /m/, /n/, /p/, /s/, /t/ and /v/) by gerbils. Four gerbils were trained to perform an oddball target detection paradigm in which they were required to discriminate a deviant CVC or VCV in a sequence of CVC or VCV standards, respectively. The experiments were performed with an ICRA-1 noise masker with speech-like spectral properties, and logatomes of multiple speakers were presented at various signal-to-noise ratios. Response latencies were measured to generate perceptual maps employing multidimensional scaling, which visualize the gerbils' internal maps of the sounds. The dimensions of the perceptual maps were correlated to multiple phonetic features of the speech sounds for evaluating which features of vowels and consonants are most important for the discrimination. The perceptual representation of vowels and consonants in gerbils was similar to that of humans, although gerbils needed higher signal-to-noise ratios for the discrimination of speech sounds than humans. The gerbils' discrimination of vowels depended on differences in the frequencies of the first and second formant determined by tongue height and position. Consonants were discriminated based on differences in combinations of their articulatory features. The similarities in the perception of logatomes by gerbils and humans renders the gerbil a suitable model for human speech sound discrimination.}, } @article {pmid35259200, year = {2022}, author = {Tamura, T and Tanaka, Y and Watanabe, Y and Sato, K}, title = {Relationships between maximum tongue pressure and second formant transition in speakers with different types of dysarthria.}, journal = {PloS one}, volume = {17}, number = {3}, pages = {e0264995}, pmid = {35259200}, issn = {1932-6203}, mesh = {Adult ; Aged ; *Dysarthria ; Female ; Humans ; Male ; Pressure ; *Speech Acoustics ; Speech Intelligibility/physiology ; Speech Production Measurement ; Tongue ; Young Adult ; }, abstract = {The effects of muscle weakness on speech are currently not fully known. We investigated the relationships between maximum tongue pressure and second formant transition in adults with different types of dysarthria. It focused on the slope in the second formant transition because it reflects the tongue velocity during articulation. 
Sixty-three Japanese speakers with dysarthria (median age, 68 years; interquartile range, 58-77 years; 44 men and 19 women) admitted to acute and convalescent hospitals were included. Thirty neurologically normal speakers aged 19-85 years (median age, 22 years; interquartile range, 21.0-23.8 years; 14 men and 16 women) were also included. The relationship between the maximum tongue pressure and speech function was evaluated using correlation analysis in the dysarthria group. Speech intelligibility, the oral diadochokinesis rate, and the second formant slope were used as indices of impaired speech. More than half of the speakers had mild to moderate dysarthria. Speakers with dysarthria showed significantly lower maximum tongue pressure, speech intelligibility, oral diadochokinesis rate, and second formant slope than neurologically normal speakers. Only the second formant slope was significantly correlated with the maximum tongue pressure (r = 0.368, p = 0.003). The relationship between the second formant slope and maximum tongue pressure showed a similar correlation in the analysis of subgroups divided by sex. The oral diadochokinesis rate, which is related to the speed of articulation, is affected by voice on/off, mandibular opening/closing, and range of motion. In contrast, the second formant slope was less affected by these factors. These results suggest that the maximum isometric tongue strength is associated with tongue movement speed during articulation.}, } @article {pmid35250034, year = {2022}, author = {Georgiou, GP}, title = {Acoustic markers of vowels produced with different types of face masks.}, journal = {Applied acoustics. Acoustique applique. Angewandte Akustik}, volume = {191}, number = {}, pages = {108691}, pmid = {35250034}, issn = {0003-682X}, abstract = {The wide spread of SARS-CoV-2 led to the extensive use of face masks in public places. Although masks offer significant protection from infectious droplets, they also impact verbal communication by altering the speech signal. The present study examines how two types of face masks affect the speech properties of vowels. Twenty speakers were recorded producing their native vowels in a /pVs/ context, maintaining a normal speaking rate. Speakers were asked to produce the vowels in three conditions: (a) with a surgical mask, (b) with a cotton mask, and (c) without a mask. The speakers' output was analyzed with the Praat speech acoustics software. We fitted three linear mixed-effects models to investigate the mask-wearing effects on the first formant (F1), second formant (F2), and duration of vowels. The results demonstrated that F1 and duration of vowels remained intact in the masked conditions compared to the unmasked condition, while F2 was altered for three out of five vowels (/e a u/) with the surgical mask and two out of five vowels (/e a/) with the cotton mask. So, both types of masks altered the speech signal to some extent, and they mostly affected the same vowel qualities. It is concluded that some acoustic properties are more sensitive than others to speech signal modification when speech is filtered through masks, while different sounds are affected in different ways.
The findings may have significant implications for second/foreign language instructors who teach pronunciation and for speech therapists who teach sounds to individuals with language disorders.}, } @article {pmid35249395, year = {2023}, author = {Bertucci, V and Stevens, K and Sidhu, N and Suri, S and Bressmann, T}, title = {The Impact of Fan-Type Rapid Palatal Expanders on Speech in Patients With Unilateral Cleft Lip and Palate.}, journal = {The Cleft palate-craniofacial journal : official publication of the American Cleft Palate-Craniofacial Association}, volume = {60}, number = {7}, pages = {875-887}, pmid = {35249395}, issn = {1545-1569}, mesh = {Humans ; *Cleft Lip/surgery ; Speech ; *Cleft Palate/surgery ; Prospective Studies ; }, abstract = {Rapid palatal expanders (RPEs) are commonly used in patients with cleft lip and palate (CLP) prior to secondary alveolar bone grafting (SABG). Their position and size can impede tongue movement and affect speech. This study assessed changes in perception and production of speech over the course of RPE treatment. DESIGN: Prospective longitudinal. SETTING: Tertiary university-affiliated hospital. PARTICIPANTS: Twenty-five patients with unilateral CLP treated with Fan-type RPEs, and their parents. Patient and parent speech questionnaires and patient speech recordings were collected at baseline before RPE insertion (T1), directly after RPE insertion (T2), during RPE expansion (T3), during RPE retention (T4), directly after RPE removal but before SABG (T5), and at short-term follow-up after RPE removal and SABG (T6). Ratings for patient and parent questionnaires, first (F1) and second (F2) formants for vowels /a/, /i/, and /u/, and nasalance scores for non-nasal and nasal sentences were obtained and analyzed using mixed model analyses of variance. RESULTS: Ratings worsened at T2. For the vowel /a/, F1 and F2 were unchanged at T2. For the vowel /i/, F1 increased and F2 decreased at T2. For the vowel /u/, F1 was unchanged and F2 decreased at T2. Nasalance was unchanged at T2. All outcome measures returned to T1 levels by T4. CONCLUSIONS: RPE insertion resulted in initial adverse effects on speech perception and production, which decreased to baseline prior to removal. Information regarding transient speech dysfunction and distress may help prepare patients for treatment.
Vocal tract shortening from the original length conveyed smaller size and less aggression, whereas vocal tract elongation conveyed larger size and more aggression, and these effects were stronger for static than for dynamic scaling. Listeners familiarized with the speaker's natural voice were less often 'fooled' by formant manipulations when judging speaker size, but paid more attention to formants when judging aggressive intent. Thus, within-call vocal tract scaling conveys emotion, but a better way to sound large and intimidating is to keep the vocal tract consistently extended.}, } @article {pmid35240298, year = {2022}, author = {Haider, CL and Suess, N and Hauswald, A and Park, H and Weisz, N}, title = {Masking of the mouth area impairs reconstruction of acoustic speech features and higher-level segmentational features in the presence of a distractor speaker.}, journal = {NeuroImage}, volume = {252}, number = {}, pages = {119044}, doi = {10.1016/j.neuroimage.2022.119044}, pmid = {35240298}, issn = {1095-9572}, support = {P 31230/FWF_/Austrian Science Fund FWF/Austria ; P 34237/FWF_/Austrian Science Fund FWF/Austria ; }, mesh = {Acoustic Stimulation ; Acoustics ; Humans ; Mouth ; *Speech ; *Speech Perception ; Visual Perception ; }, abstract = {Multisensory integration enables stimulus representation even when the sensory input in a single modality is weak. In the context of speech, when confronted with a degraded acoustic signal, congruent visual inputs promote comprehension. When this input is masked, speech comprehension consequently becomes more difficult. But it still remains inconclusive which levels of speech processing are affected under which circumstances by occluding the mouth area. To answer this question, we conducted an audiovisual (AV) multi-speaker experiment using naturalistic speech. In half of the trials, the target speaker wore a (surgical) face mask, while we measured the brain activity of normal hearing participants via magnetoencephalography (MEG). We additionally added a distractor speaker in half of the trials in order to create an ecologically difficult listening situation. A decoding model on the clear AV speech was trained and used to reconstruct crucial speech features in each condition. We found significant main effects of face masks on the reconstruction of acoustic features, such as the speech envelope and spectral speech features (i.e. pitch and formant frequencies), while reconstruction of higher level features of speech segmentation (phoneme and word onsets) were especially impaired through masks in difficult listening situations. As we used surgical face masks in our study, which only show mild effects on speech acoustics, we interpret our findings as the result of the missing visual input. 
Our findings extend previous behavioural results by demonstrating the complex contextual effects of occluding relevant visual information on speech processing.}, } @article {pmid35232632, year = {2024}, author = {Hoyer, P and Riedler, M and Unterhofer, C and Graf, S}, title = {Vocal Tract and Subglottal Impedance in High Performance Singing: A Case Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {5}, pages = {1248.e11-1248.e21}, doi = {10.1016/j.jvoice.2022.01.015}, pmid = {35232632}, issn = {1873-4588}, mesh = {Humans ; *Singing ; *Phonation ; Female ; *Glottis/physiology ; *Voice Quality ; Prospective Studies ; Vibration ; Acoustics ; Electric Impedance ; Biomechanical Phenomena ; Inhalation/physiology ; Vocal Cords/physiology ; Adult ; Sound Spectrography ; Exhalation/physiology ; }, abstract = {OBJECTIVES/HYPOTHESIS: The respiratory process is important in vocal training, and in professional singing the airflow is highly important. It is hypothesized that subglottal resonances are important to the singing voice in high performance singing.

STUDY DESIGN: Single subject, prospective.

METHOD: A professional soprano singer shaped her vocal tract to form the vowels [a], [e], [i], [o], and [u] at the pitch d4. We measured phonated vowels and the vocal tract impedance spectra with a deterministic noise supplied by an iPhone buzzer in the range of 200 to 4,000 Hz at closed glottis, during exhalation and during inhalation while maintaining the shape of the vocal tract.
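(A generic sketch of reading resonances off such measurements, not the authors' exact procedure: the magnitude spectrum of the recorded response to the broadband excitation is computed, and local maxima in the 200-4,000 Hz band are taken as resonance candidates.)

import numpy as np
from scipy.signal import find_peaks

def resonance_peaks(response, fs, f_lo=200.0, f_hi=4000.0):
    spectrum = np.abs(np.fft.rfft(response))
    freqs = np.fft.rfftfreq(len(response), d=1.0 / fs)
    band = (freqs >= f_lo) & (freqs <= f_hi)
    # the 5% prominence threshold is illustrative; tune to the measurement noise
    peaks, _ = find_peaks(spectrum[band], prominence=spectrum[band].max() * 0.05)
    return freqs[band][peaks]

# response: recorded pressure signal (numpy array); fs: sampling rate in Hz
print(resonance_peaks(response, fs))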

RESULTS: Measurements of the phonated vowels before and after the different glottal adjustments were highly reproducible. Vocal tract resonances and the ones resulting during respiration are reported. The impedance spectra show vowel dependent resonances with closed and open glottis. The formants of the vocal spectra are explained by including both, the vocal tract, and the subglottal resonances.

CONCLUSION: The findings indicate that subglottal resonances influence the first formant as well as the singer's formant cluster in high-performance singing. The instrumental setup used for the impedance measurement allows a simple and lightweight procedure for measuring vocal tract and subglottal resonances.}, } @article {pmid35232067, year = {2022}, author = {Luberadzka, J and Kayser, H and Hohmann, V}, title = {Making sense of periodicity glimpses in a prediction-update-loop-A computational model of attentive voice tracking.}, journal = {The Journal of the Acoustical Society of America}, volume = {151}, number = {2}, pages = {712}, pmid = {35232067}, issn = {1520-8524}, support = {R01 DC015429/DC/NIDCD NIH HHS/United States ; }, mesh = {Acoustics ; Bayes Theorem ; Computer Simulation ; Humans ; Periodicity ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {Humans are able to follow a speaker even in challenging acoustic conditions. The perceptual mechanisms underlying this ability remain unclear. A computational model of attentive voice tracking, consisting of four computational blocks: (1) sparse periodicity-based auditory features (sPAF) extraction, (2) foreground-background segregation, (3) state estimation, and (4) top-down knowledge, is presented. The model connects the theories about auditory glimpses, foreground-background segregation, and Bayesian inference. It is implemented with the sPAF, sequential Monte Carlo sampling, and probabilistic voice models. The model is evaluated by comparing it with the human data obtained in the study by Woods and McDermott [Curr. Biol. 25(17), 2238-2246 (2015)], which measured the ability to track one of two competing voices with time-varying parameters [fundamental frequency (F0) and formants (F1,F2)]. Three model versions were tested, which differ in the type of information used for the segregation: version (a) uses the oracle F0, version (b) uses the estimated F0, and version (c) uses the spectral shape derived from the estimated F0 and oracle F1 and F2. Version (a) simulates the optimal human performance in conditions with the largest separation between the voices, version (b) simulates the conditions in which the separation is not sufficient to follow the voices, and version (c) is closest to the human performance for moderate voice separation.}, } @article {pmid35232065, year = {2022}, author = {Saba, JN and Hansen, JHL}, title = {The effects of Lombard perturbation on speech intelligibility in noise for normal hearing and cochlear implant listeners.}, journal = {The Journal of the Acoustical Society of America}, volume = {151}, number = {2}, pages = {1007}, pmid = {35232065}, issn = {1520-8524}, support = {R01 DC016839/DC/NIDCD NIH HHS/United States ; }, mesh = {Acoustic Stimulation/methods ; *Cochlear Implants ; Hearing ; Speech Intelligibility ; *Speech Perception ; }, abstract = {Natural compensation of speech production in challenging listening environments is referred to as the Lombard effect (LE). The resulting acoustic differences between neutral and Lombard speech have been shown to provide intelligibility benefits for normal hearing (NH) and cochlear implant (CI) listeners alike. Motivated by this outcome, three LE perturbation approaches consisting of pitch, duration, formant, intensity, and spectral contour modifications were designed specifically for CI listeners to combat speech-in-noise performance deficits.
Experiment 1 analyzed the effects of loudness, quality, and distortion of approaches on speech intelligibility with and without formant-shifting. Significant improvements of +9.4% were observed in CI listeners without the formant-shifting approach at +5 dB signal-to-noise ratio (SNR) large-crowd-noise (LCN) when loudness was controlled; however, performance was found to be significantly lower for NH listeners. Experiment 2 evaluated the non-formant-shifting approach with additional spectral contour and high-pass filtering to reduce spectral smearing and decrease distortion observed in Experiment 1. This resulted in significant intelligibility benefits of +30.2% for NH and +21.2% for CI listeners at 0 and +5 dB SNR LCN, respectively. These results suggest that LE perturbation may be useful as a front-end speech modification approach to improve intelligibility for CI users in noise.}, } @article {pmid35180005, year = {2022}, author = {Sen, A and Thakkar, H and Vincent, V and Rai, S and Singh, A and Mohanty, S and Roy, A and Ramakrishnan, L}, title = {Endothelial colony forming cells' tetrahydrobiopterin level in coronary artery disease patients and its association with circulating endothelial progenitor cells.}, journal = {Canadian journal of physiology and pharmacology}, volume = {100}, number = {5}, pages = {473-485}, doi = {10.1139/cjpp-2021-0548}, pmid = {35180005}, issn = {1205-7541}, mesh = {Biopterins/analogs & derivatives ; *Coronary Artery Disease ; *Endothelial Progenitor Cells ; Humans ; }, abstract = {Endothelial colony forming cells (ECFCs) participate in neovascularization. Endothelial nitric oxide synthase (eNOS) derived NO· helps in homing of endothelial progenitor cells (EPCs) at the site of vascular injury. The enzyme cofactor tetrahydrobiopterin (BH4) stabilizes the catalytic active state of eNOS. The association of intracellular ECFC biopterins and the ratio of reduced to oxidized biopterin (BH4:BH2) with circulatory EPCs and ECFC functionality has not been studied. We investigated ECFC biopterin levels and their association with circulatory EPCs as well as ECFC proliferative potential in terms of day of appearance in culture. Circulatory EPCs were enumerated by flow cytometry in 53 coronary artery disease (CAD) patients and 42 controls. ECFCs were cultured, characterized, and biopterin levels assessed by high-performance liquid chromatography. The day of appearance of ECFC colonies and their number were recorded. Circulatory EPCs were significantly lower in CAD, and ECFCs appeared in 56% and 33% of CAD and control subjects, respectively. Intracellular BH4 and BH4:BH2 were significantly reduced in CAD. BH4:BH2 was positively correlated with circulatory EPCs (p = 0.01), and negatively with day of appearance of ECFCs (p = 0.04). Circulatory EPCs negatively correlated with ECFC appearance (p = 0.02).
These findings suggest a role for biopterins in maintaining circulatory EPCs and the functional integrity of ECFCs.}, } @article {pmid35175986, year = {2022}, author = {Lou, Q and Wang, X and Jiang, L and Wang, G and Chen, Y and Liu, Q}, title = {Subjective and Objective Evaluation of Speech in Adult Patients with Unrepaired Cleft Palate.}, journal = {The Journal of craniofacial surgery}, volume = {33}, number = {5}, pages = {e528-e532}, doi = {10.1097/SCS.0000000000008567}, pmid = {35175986}, issn = {1536-3732}, mesh = {Adult ; *Cleft Palate/complications/surgery ; Humans ; Speech ; Speech Disorders/diagnosis/etiology ; Speech Intelligibility ; Speech Production Measurement/methods ; Voice Quality ; Young Adult ; }, abstract = {OBJECTIVE: To explore the speech outcomes of adult patients through subjective perception evaluation and objective acoustic analysis, and to compare the differences in pronunciation characteristics between adult speakers with unrepaired cleft palate and their non-cleft peers.

PARTICIPANTS AND INTERVENTION: Subjective evaluation indicators included speech intelligibility, nasality, and consonant missing rate, whereas objective acoustic parameters included normalized vowel formants, voice onset time, and three-dimensional spectrogram and spectrum analysis. Both evaluations were carried out on speech samples produced by two groups of speakers: (a) speakers with unrepaired cleft palate (n = 65, mean age = 25.1 years) and (b) typical speakers (n = 30, mean age = 23.7 years).
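[Editor's note: the "normalized vowel formants" above usually imply a speaker-intrinsic normalization; the minimal Python sketch below shows Lobanov z-score normalization, one common choice. The token values are invented placeholders, and this is not code from the study.]

```python
import numpy as np

def lobanov_normalize(formants):
    """Z-score (Lobanov) normalization of formant values for one speaker.

    formants: array of shape (n_tokens, n_formants), e.g., F1/F2 in Hz.
    Returns dimensionless z-scores, removing speaker-specific vocal tract
    differences before between-group comparisons.
    """
    formants = np.asarray(formants, dtype=float)
    return (formants - formants.mean(axis=0)) / formants.std(axis=0)

# Hypothetical F1/F2 values (Hz) for one speaker's vowel tokens
tokens = np.array([[800.0, 1200.0],   # /a/-like token
                   [350.0, 2300.0],   # /i/-like token
                   [450.0,  900.0]])  # /u/-like token
print(lobanov_normalize(tokens))
```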

RESULTS: Compared with typical speakers, individuals with unrepaired cleft palate exhibited lower speech intelligibility with higher nasality and consonant missing rates; the missing rate was highest for the six consonant syllables. The acoustic differences were mainly manifested in vowel formants and voice onset time.

CONCLUSIONS: The results revealed important acoustical differences between adult patients with unrepaired cleft palate and typical speakers. The trend of spectral deviation may have contributed to the difficulty in producing pressure vowels and aspirated consonants in individuals with speech disorders related to cleft palate.}, } @article {pmid35166414, year = {2022}, author = {Nguyen, DD and Chacon, A and Payten, C and Black, R and Sheth, M and McCabe, P and Novakovic, D and Madill, C}, title = {Acoustic characteristics of fricatives, amplitude of formants and clarity of speech produced without and with a medical mask.}, journal = {International journal of language & communication disorders}, volume = {57}, number = {2}, pages = {366-380}, pmid = {35166414}, issn = {1460-6984}, mesh = {Acoustics ; Humans ; Phonetics ; *Speech ; Speech Acoustics ; Speech Disorders ; *Speech Perception ; }, abstract = {BACKGROUND: Previous research has found that high-frequency energy of speech signals decreased while wearing face masks. However, no study has examined the specific spectral characteristics of fricative consonants and vowels and the perception of clarity of speech in mask wearing.

AIMS: To investigate the acoustic-phonetic characteristics of fricative consonants and vowels and auditory-perceptual ratings of the clarity of speech produced with and without a face mask.

METHODS & PROCEDURES: A total of 16 healthcare workers read the Rainbow Passage using modal phonation in three conditions: without a face mask, with a standard surgical mask, and with a KN95 mask (China GB2626-2006, a medical respirator with a higher barrier level than the standard surgical mask). Speech samples were acoustically analysed for root mean square (RMS) amplitude (ARMS) and spectral moments of four fricatives /f/, /s/, /ʃ/ and /z/; and amplitude of the first three formants (A1, A2 and A3) measured from the reading passage and extracted vowels. Auditory-perceptual rating of speech clarity was performed. Data were compared across mask and non-mask conditions using linear mixed models.
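[Editor's note: the RMS amplitude and spectral moments named above can be computed along the lines of this minimal Python sketch; the 44.1-kHz rate and the synthetic noise burst are assumptions, and this is not the study's analysis script.]

```python
import numpy as np

def fricative_measures(segment, sr=44100):
    """RMS amplitude (ARMS) and first two spectral moments of a fricative.

    segment: 1-D array of audio samples for the segmented fricative.
    Returns (rms, centre_of_gravity_hz, spectral_sd_hz).
    """
    rms = np.sqrt(np.mean(segment ** 2))            # overall amplitude
    spectrum = np.abs(np.fft.rfft(segment)) ** 2    # power spectrum
    freqs = np.fft.rfftfreq(len(segment), d=1 / sr)
    p = spectrum / spectrum.sum()                   # normalize to a distribution
    cog = np.sum(freqs * p)                         # 1st moment: centre of gravity
    sd = np.sqrt(np.sum(((freqs - cog) ** 2) * p))  # 2nd moment: spectral SD
    return rms, cog, sd

# Hypothetical /s/-like noise burst for demonstration (100 ms at 44.1 kHz)
noise = np.random.default_rng(0).normal(size=4410)
print(fricative_measures(noise))
```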

OUTCOMES & RESULTS: The ARMS of all included fricatives was significantly lower in the surgical mask and KN95 mask conditions than in the non-mask condition. The centre of gravity of /f/ decreased in both the surgical and KN95 masks, while other spectral moments did not show systematic significant linear trends across mask conditions. None of the formant amplitude measures was statistically different across conditions. Speech clarity was significantly poorer in both the surgical and KN95 mask conditions.

Speech produced while wearing either a surgical mask or KN95 mask was associated with decreased fricative amplitude and poorer speech clarity.

WHAT THIS PAPER ADDS: What is already known on the subject Previous studies have shown that the overall spectral levels in high frequency ranges and intelligibility are decreased for speech produced with a face mask. It is unclear how different types of speech signals, that is, fricatives and vowels, are affected in speech produced while wearing either a medical surgical mask or a KN95 mask. It is also unclear whether ratings of speech clarity are similar for speech produced with these face masks. What this paper adds to existing knowledge Speech data collected in a real-world, clinical, non-laboratory-controlled setting showed differences in the amplitude of fricatives and speech clarity ratings between non-mask and mask-wearing conditions. Formant amplitude did not show significant differences in mask-wearing conditions compared with the non-mask condition. What are the potential or actual clinical implications of this work? Wearing a surgical mask or a KN95 mask had different effects on consonants and vowels. It appeared from the findings in this study that these masks only affected fricative consonants and did not affect vowel production. The poorer speech clarity in these mask-wearing conditions has important implications for speech perception in communication between clinical staff and between medical officers and patients in clinics, and between people in everyday situations. The impact of these masks on speech perception may be more pronounced in people with hearing impairment and communication disorders. In voice evaluation and/or therapy sessions, the effects of wearing a medical mask can occur bidirectionally for both the clinician and the patient. The patient may find it more challenging to understand the speech conveyed by the clinician, while the clinician may not perceptually assess the patient's speech and voice accurately. Given the significant correlation between clarity ratings and fricative amplitude, improving fricative signals would be useful to improve speech clarity while wearing these medical face masks.}, } @article {pmid35142977, year = {2022}, author = {Gábor, A and Kaszás, N and Faragó, T and Pérez Fraga, P and Lovas, M and Andics, A}, title = {The acoustic bases of human voice identity processing in dogs.}, journal = {Animal cognition}, volume = {25}, number = {4}, pages = {905-916}, pmid = {35142977}, issn = {1435-9456}, support = {LP2017-13/2017//magyar tudományos akadémia/ ; 950159//h2020 european research council/ ; ÚNKP-20-4-II-ELTE-286//hungarian ministry for innovation and technology, national research, development and innovation fund/ ; ÚNKP-20-5-ELTE-337//hungarian ministry for innovation and technology, national research, development and innovation fund/ ; BO/751/20//mta bolyai research scholarship, hungary/ ; ÚNKP-21-5-ELTE-1061//Hungarian Ministry for Innovation and Technology, National Research, Development and Innovation Fund/ ; }, mesh = {Acoustics ; Animals ; Cues ; Dogs ; Humans ; Recognition, Psychology ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {Speech carries identity-diagnostic acoustic cues that help individuals recognize each other during vocal-social interactions. In humans, fundamental frequency, formant dispersion and harmonics-to-noise ratio serve as characteristics along which speakers can be reliably separated. The ability to infer a speaker's identity is also adaptive for members of other species (like companion animals) for whom humans (as owners) are relevant.
The acoustic bases of speaker recognition in non-humans are unknown. Here, we tested whether dogs can recognize their owner's voice and whether they rely on the same acoustic parameters for such recognition as humans use to discriminate speakers. Stimuli were pre-recorded sentences spoken by the owner and control persons, played through loudspeakers placed behind two non-transparent screens (with each screen hiding a person). We investigated the association between acoustic distance of speakers (examined along several dimensions relevant in intraspecific voice identification) and dogs' behavior. Dogs chose their owner's voice more often than those of the control persons, suggesting that they can identify it. Choosing success and time spent looking in the direction of the owner's voice were positively associated, showing that looking time is an index of the ease of choice. Acoustic distance of speakers in mean fundamental frequency and jitter was positively associated with looking time, indicating that the shorter the acoustic distance between speakers with regard to these parameters, the harder the decision. So, dogs use these cues to discriminate their owner's voice from unfamiliar voices. These findings reveal that dogs use some but probably not all acoustic parameters that humans use to identify speakers. Although dogs can detect fine changes in speech, their perceptual system may not be fully attuned to identity-diagnostic cues in the human voice.}, } @article {pmid35141903, year = {2022}, author = {V, K and S, SP}, title = {Hybrid machine learning classification scheme for speaker identification.}, journal = {Journal of forensic sciences}, volume = {67}, number = {3}, pages = {1033-1048}, doi = {10.1111/1556-4029.15006}, pmid = {35141903}, issn = {1556-4029}, mesh = {*Machine Learning ; Speech ; *Support Vector Machine ; }, abstract = {Motivated by the requirement to prepare for the next generation of "Automatic Spokesperson Recognition" (ASR) systems, this paper applied fused spectral features with a hybrid machine learning (ML) strategy in the speech communication field. This strategy combined spectral features such as mel-frequency cepstral coefficients (MFCCs), spectral kurtosis, spectral skewness, normalized pitch frequency (NPF), and formants. The suggested classification method could possibly serve in advanced speaker identification scenarios. Special attention was given to a hybrid ML scheme capable of finding unknown speakers, equipped with a speaker-identifying classifier technique known as "Random Forest-Support Vector Machine" (RF-SVM). The extracted speaker-specific spectral attributes are applied to the hybrid RF-SVM classifier to identify/verify the particular speaker. This work aims to construct an ensemble decision tree on a bounded area with minimal misclassification error using a hybrid ensemble RF-SVM strategy. A series of standard, real-time speaker databases, and noise conditions are functionally tested to validate its performance with other state-of-the-art mechanisms. The proposed fusion method succeeds in the speaker identification task with a high identification rate (97% avg) and lower equal error rate (EER) (<2%), compared with the individual schemes for the recorded experimental dataset. The robustness of the classifier is validated using the standard ELSDSR, TIMIT, and NIST audio datasets.
Experiments on the ELSDSR, TIMIT, and NIST datasets show that the hybrid classifier produces 98%, 99%, and 94% accuracy, with EERs of 2%, 1%, and 2%, respectively. The findings were then compared with other well-known speaker recognition schemes and found to be superior.}, } @article {pmid35135714, year = {2024}, author = {Menezes, DP and de Lira, ZS and Araújo, ANB and de Almeida, AAF and Gomes, AOC and Moraes, BT and Lucena, JA}, title = {Prosodic Differences in the Voices of Transgender and Cisgender Women: Self-Perception of Voice - An Auditory and Acoustic Analysis.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {4}, pages = {844-857}, doi = {10.1016/j.jvoice.2021.12.020}, pmid = {35135714}, issn = {1873-4588}, mesh = {Humans ; Female ; *Transgender Persons/psychology ; *Voice Quality ; Adult ; *Speech Acoustics ; *Self Concept ; *Speech Perception ; Young Adult ; *Emotions ; *Speech Production Measurement ; *Acoustics ; Cross-Sectional Studies ; Middle Aged ; Male ; Transsexualism/physiopathology/psychology ; Gender Identity ; }, abstract = {INTRODUCTION: The voice is an important parameter for identifying the speaker's gender. Transgender people seek to adapt their bodies to their gender identity, and transgender women have greater difficulties in achieving vocal acceptance. In this context, the evaluation of the various parameters of the voice of transgender and cisgender women is essential to make it possible to propose appropriate intervention measures.

OBJECTIVES: To identify the differences in vocal characteristics between transgender and cisgender women.

METHODS: A cross-sectional study was conducted. The sample comprised 20 transgender women and 20 cisgender women who underwent evaluation of acoustic parameters, emotional prosody, self-perception, and perception of gender by lay listeners.

RESULTS: The vocal characteristics of transgender and cisgender women differ in terms of the following parameters: f0, glottal noise excitation (GNE), vocal intensity, speech range profile (SRP), the first three formants of the vowel /a/, and emotional prosody, including duration and melodic contour. Higher values were mostly found in the cisgender population, except for noise level and vocal intensity. In addition, in most cases lay listeners identified the voices of transgender women as belonging to the male gender. There was a negative correlation between vocal dissatisfaction and f0 among transgender women.

CONCLUSIONS: Even though they perform vocal adjustments, the voices of transgender women differ from those of cisgender women in terms of acoustic parameters, vocal extension, and emotional prosody, including duration and melodic contour. These differences have repercussions on the perception of gender by listeners.}, } @article {pmid35130577, year = {2022}, author = {Rishiq, D and Harkrider, A and Springer, C and Hedrick, M}, title = {Effects of Spectral Shaping on Speech Auditory Brainstem Responses to Stop Consonant-Vowel Syllables.}, journal = {Journal of the American Academy of Audiology}, volume = {33}, number = {4}, pages = {232-243}, doi = {10.1055/a-1764-9805}, pmid = {35130577}, issn = {2157-3107}, mesh = {Humans ; Male ; Young Adult ; Aged ; Evoked Potentials, Auditory, Brain Stem/physiology ; Speech ; Acoustic Stimulation/methods ; *Speech Perception/physiology ; *Hearing Aids ; }, abstract = {BACKGROUND: Spectral shaping is employed by hearing aids to make consonantal information, such as formant transitions, audible for listeners with hearing loss. How manipulations of the stimuli, such as spectral shaping, may alter encoding in the auditory brainstem has not been thoroughly studied.

PURPOSE: The aim of this study was to determine how spectral shaping of synthetic consonant-vowel (CV) syllables, varying in their second formant (F2) onset frequency, may affect encoding of the syllables in the auditory brainstem.

RESEARCH DESIGN: We employed a repeated-measures design.

STUDY SAMPLE: Sixteen young adults (mean = 20.94 years, 6 males) and 11 older adults (mean = 58.60 years, 4 males) participated in this study.

DATA COLLECTION AND ANALYSIS: Speech-evoked auditory brainstem responses (speech-ABRs) were obtained from each participant using three CV exemplars selected from synthetic stimuli generated for a /ba-da-ga/ continuum. Brainstem responses were also recorded to the corresponding three CV exemplars that were spectrally shaped to decrease low-frequency information and provide gain for middle and high frequencies according to a Desired Sensation Level function. In total, six grand average waveforms (3 phonemes [/ba/, /da/, /ga/] X 2 shaping conditions [unshaped, shaped]) were produced for each participant. Peak latencies and amplitudes, referenced to the prestimulus baseline, were identified for 15 speech-ABR peaks. Peaks were marked manually using the program cursor on each individual waveform. Repeated-measures analyses of variance were used to determine the effects of shaping on the latencies and amplitudes of the speech-ABR peaks.
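[Editor's note: although peaks were marked manually in this study, the latency/amplitude measurement itself can be approximated programmatically; the Python sketch below uses scipy.signal.find_peaks on a synthetic averaged waveform and is purely illustrative, not the authors' procedure.]

```python
import numpy as np
from scipy.signal import find_peaks

fs = 16000                                   # assumed sampling rate (Hz)
t = np.arange(-0.01, 0.05, 1 / fs)           # 10 ms prestimulus baseline
# Synthetic averaged waveform: damped 500 Hz oscillation plus noise
waveform = np.sin(2 * np.pi * 500 * t) * np.exp(-60 * np.abs(t - 0.01))
waveform += np.random.default_rng(1).normal(scale=0.02, size=t.size)

baseline = waveform[t < 0].mean()            # prestimulus reference level
peaks, _ = find_peaks(waveform, height=baseline + 0.1,
                      distance=int(0.002 * fs))  # peaks at least 2 ms apart

for p in peaks:
    latency_ms = t[p] * 1000                 # latency re: stimulus onset
    amplitude = waveform[p] - baseline       # amplitude re: baseline
    print(f"peak at {latency_ms:.2f} ms, amplitude {amplitude:.3f}")
```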

RESULTS: Shaping effects produced changes within participants in ABR latencies and amplitudes involving onset and major peaks of the speech-ABR waveform for certain phonemes. Specifically, data from onset peaks showed that shaping decreased latency for /ga/ in older listeners and decreased onset amplitude for /ba/ in younger listeners. Shaping also increased the amplitudes of major peaks for /ga/ stimuli in both groups.

CONCLUSIONS: Encoding of speech in the ABR waveform may be more complex and multidimensional than a simple demarcation of source and filter information. These results suggest a more complex subcortical encoding of vocal tract filter information in the ABR waveform, which may also be influenced by cue intensity and age.

DESIGN: EFRs were elicited by a male-spoken /i/ (stimulus; duration = 350 msec), modified to elicit two EFRs, one from the region of the first formant (F1) and one from the second and higher formants (F2+). The stimulus, presented at 65 dB SPL, was preceded by one of the four contexts: /ʃ/, /m/, /i/ or a silent gap of duration equal to that of the stimulus. The level of the context phonemes was either 50 or 80 dB SPL, 15 dB lower and higher than the level of the stimulus /i/. In a control condition, EFRs to the stimulus /i/ were elicited in isolation without any preceding phoneme contexts. The stimulus and the contexts were presented monaurally to a randomly chosen test ear in 21 young adults with normal hearing. EFRs were recorded using single-channel electroencephalogram between the vertex and the nape.
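[Editor's note: EFR amplitudes of the kind reported below in nanovolts are conventionally quantified as the spectral magnitude of the averaged EEG at the voice fundamental; this minimal Python sketch assumes a 100-Hz f0 and synthetic data, and is not the study's analysis pipeline.]

```python
import numpy as np

fs = 8000          # assumed EEG sampling rate (Hz)
f0 = 100.0         # assumed voice fundamental frequency (Hz)
dur = 0.35         # stimulus duration from the abstract (350 ms)
t = np.arange(0, dur, 1 / fs)

# Synthetic averaged EEG: a small response at f0 buried in noise (volts)
rng = np.random.default_rng(2)
eeg = 100e-9 * np.sin(2 * np.pi * f0 * t) + rng.normal(scale=50e-9, size=t.size)

# Amplitude spectrum of the averaged response
spectrum = np.abs(np.fft.rfft(eeg)) * 2 / t.size
freqs = np.fft.rfftfreq(t.size, d=1 / fs)
bin_f0 = np.argmin(np.abs(freqs - f0))       # FFT bin nearest f0

print(f"EFR amplitude at {freqs[bin_f0]:.1f} Hz: {spectrum[bin_f0] * 1e9:.1f} nV")
```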

RESULTS: A repeated measures analysis of variance indicated a significant three-way interaction between context type (/ʃ/, /i/, /m/, silent gap), level (50, 80 dB SPL), and EFR-eliciting formant (F1, F2+). Post hoc analyses indicated no influence of the preceding phoneme context on F1-elicited EFRs. Relative to a silent gap as the preceding context, F2+-elicited EFRs were attenuated by /ʃ/ and /m/ presented at 50 and 80 dB SPL, as well as by /i/ presented at 80 dB SPL. The average attenuation ranged from 14.9 to 27.9 nV. When the context phonemes were presented at matched levels of 50 or 80 dB SPL, F2+-elicited EFRs were most often attenuated when preceded by /ʃ/. At 80 dB SPL, relative to the silent preceding gap, the average attenuation was 15.7 nV, and at 50 dB SPL, relative to the preceding context phoneme /i/, the average attenuation was 17.2 nV.

CONCLUSION: EFRs elicited by the second and higher formants of /i/ are sensitive to the spectral and level characteristics of the preceding phoneme context. Such sensitivity, measured as an attenuation in the present study, may influence the comparison of EFRs elicited by the same vowel in different consonant-vowel syllables or words. However, the degree of attenuation with realistic context levels exceeded the minimum measurable change only 12% of the time. Although the impact of the preceding context is statistically significant, it is likely to be clinically insignificant a majority of the time.}, } @article {pmid35111103, year = {2021}, author = {Chiu, C and Weng, Y and Chen, BW}, title = {Tongue Postures and Tongue Centers: A Study of Acoustic-Articulatory Correspondences Across Different Head Angles.}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {768754}, pmid = {35111103}, issn = {1664-1078}, abstract = {Recent research on body and head positions has shown that postural changes may induce varying degrees of changes on acoustic speech signals and articulatory gestures. While the preservation of formant profiles across different postures is suitably accounted for by the two-tube model and perturbation theory, it remains unclear whether it is resulted from the accommodation of tongue postures. Specifically, whether the tongue accommodates the changes in head angle to maintain the target acoustics is yet to be determined. The present study examines vowel acoustics and their correspondence with the articulatory maneuvers of the tongue, including both tongue postures and movements of the tongue center, across different head angles. The results show that vowel acoustics, including pitch and formants, are largely unaffected by upward or downward tilting of the head. These preserved acoustics may be attributed to the lingual gestures that compensate for the effects of gravity. Our results also reveal that the tongue postures in response to head movements appear to be vowel-dependent, and the tongue center may serve as an underlying drive that covariates with the head angle changes. These results imply a close relationship between vowel acoustics and tongue postures as well as a target-oriented strategy for different head angles.}, } @article {pmid35105035, year = {2022}, author = {Merritt, B and Bent, T}, title = {Revisiting the acoustics of speaker gender perception: A gender expansive perspective.}, journal = {The Journal of the Acoustical Society of America}, volume = {151}, number = {1}, pages = {484}, doi = {10.1121/10.0009282}, pmid = {35105035}, issn = {1520-8524}, mesh = {Acoustics ; Female ; Femininity ; Humans ; Male ; Masculinity ; *Speech Acoustics ; *Speech Perception ; }, abstract = {Examinations of speaker gender perception have primarily focused on the roles of fundamental frequency (fo) and formant frequencies from structured speech tasks using cisgender speakers. Yet, there is evidence to suggest that fo and formants do not fully account for listeners' perceptual judgements of gender, particularly from connected speech. This study investigated the perceptual importance of fo, formant frequencies, articulation, and intonation in listeners' judgements of gender identity and masculinity/femininity from spontaneous speech from cisgender male and female speakers as well as transfeminine and transmasculine speakers. 
Stimuli were spontaneous speech samples from 12 speakers who are cisgender (6 female and 6 male) and 12 speakers who are transgender (6 transfeminine and 6 transmasculine). Listeners performed a two-alternative forced choice (2AFC) gender identification task and masculinity/femininity rating task in two experiments that manipulated which acoustic cues were available. Experiment 1 confirmed that fo and formant frequency manipulations were insufficient to alter listener judgements across all speakers. Experiment 2 demonstrated that articulatory cues had greater weighting than intonation cues on the listeners' judgements when the fo and formant frequencies were in a gender ambiguous range. These findings counter the assumptions that fo and formant manipulations are sufficient to effectively alter perceived speaker gender.}, } @article {pmid35104414, year = {2022}, author = {Kim, Y and Chung, H and Thompson, A}, title = {Acoustic and Articulatory Characteristics of English Semivowels /ɹ, l, w/ Produced by Adult Second-Language Speakers.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {3}, pages = {890-905}, doi = {10.1044/2021_JSLHR-21-00152}, pmid = {35104414}, issn = {1558-9102}, mesh = {Acoustics ; Adult ; Humans ; *Language ; *Multilingualism ; Phonetics ; Speech Acoustics ; }, abstract = {PURPOSE: This study presents the results of acoustic and kinematic analyses of word-initial semivowels (/ɹ, l, w/) produced by second-language (L2) speakers of English whose native language is Korean. In addition, the relationship of acoustic and kinematic measures to the ratings of foreign accent was examined by correlation analyses.

METHOD: Eleven L2 speakers and 10 native speakers (first language [L1]) of English read The Caterpillar passage. Acoustic and kinematic data were simultaneously recorded using an electromagnetic articulography system. In addition to speaking rate, two acoustic measures (ratio of third-formant [F3] frequency to second-formant [F2] frequency and duration of steady states of F2) and two kinematic measures (lip aperture and duration of lingual maximum hold) were obtained from individual target sounds. To examine the degree of contrast among the three sounds, acoustic and kinematic Euclidean distances were computed on the F2-F3 and x-y planes, respectively.
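[Editor's note: the acoustic Euclidean distance on the F2-F3 plane reduces to the arithmetic in this short Python sketch; the formant coordinates are invented placeholders, not measurements from the study.]

```python
import numpy as np
from itertools import combinations

# Hypothetical mean (F2, F3) coordinates in Hz for the three semivowels
semivowels = {
    "r": np.array([1100.0, 1700.0]),  # /ɹ/: a low F3 is its acoustic hallmark
    "l": np.array([1200.0, 2600.0]),  # /l/
    "w": np.array([ 800.0, 2300.0]),  # /w/
}

# Pairwise Euclidean distances on the F2-F3 plane; larger distances
# indicate better-separated (more contrastive) productions.
for (a, pa), (b, pb) in combinations(semivowels.items(), 2):
    print(f"/{a}/-/{b}/ distance: {np.linalg.norm(pa - pb):.0f} Hz")
```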

RESULTS: Compared with L1 speakers, L2 speakers exhibited a significantly slower speaking rate. For the three semivowels, L2 speakers showed a reduced F3/F2 ratio during constriction, increased lip aperture, and reduced acoustic Euclidean distances among semivowels. Additionally, perceptual ratings of foreign accent were significantly correlated with three measures: duration of steady F2, acoustic Euclidean distance, and kinematic Euclidean distance.

CONCLUSIONS: The findings provide acoustic and kinematic evidence for challenges that L2 speakers experience in the production of English semivowels, especially /ɹ/ and /w/. The robust and consistent finding of reduced contrasts among semivowels and their correlations with perceptual accent ratings suggests using sound contrasts as a potentially effective approach to accent modification paradigms.}, } @article {pmid35093243, year = {2022}, author = {Takemoto, N and Sanuki, T and Esaki, S and Iwasaki, S}, title = {Rabbit model with vocal fold hyperadduction.}, journal = {Auris, nasus, larynx}, volume = {49}, number = {5}, pages = {810-815}, doi = {10.1016/j.anl.2022.01.008}, pmid = {35093243}, issn = {1879-1476}, mesh = {Animals ; *Dysphonia ; Glottis ; Humans ; Laryngeal Muscles ; Phonation/physiology ; Rabbits ; *Vocal Cords ; }, abstract = {OBJECTIVE: Adductor spasmodic dysphonia (AdSD) is caused by hyperadduction of the vocal folds during phonation, resulting in a strained voice. Animal models are not yet used to elucidate this intractable disease because AdSD has a difficult pathology without a definitive origin. For the first step, we established an animal model with vocal fold hyperadduction and evaluated its validity by assessing laryngeal function.

METHODS: In this experimental animal study, three 20-week-old adult Japanese rabbits were used. The models were created using a combination of cricothyroid approximation, forced airflow, and electrical stimulation of the recurrent laryngeal nerves (RLNs). Cricothyroid approximation was added to produce a glottal slit. Thereafter, both RLNs were electrically stimulated to induce vocal fold hyperadduction. Finally, the left RLN was transected to relieve the hyperadduction. The sound, endoscopic images, and subglottal pressure were recorded, and acoustic analysis was performed.

RESULTS: Subglottal pressure increased significantly, and a strained sound was produced after the electrical stimulation of the RLNs. After transecting the left RLN, the subglottal pressure decreased significantly, and the strained sound diminished. Acoustic analysis revealed an elevation of the standard deviation of F0 (SDF0) and degree of voice breaks (DVB) during stimulation of the RLNs, and a reduction of SDF0 and DVB after RLN transection. Formant bands in the sound spectrogram were interrupted by the stimulation and appeared again after the RLN transection.
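[Editor's note: SDF0 can be derived from any F0 tracker; the Python sketch below uses librosa's pyin on a synthetic vibrato tone as a stand-in signal. The search bounds and the signal are assumptions, and this is one plausible computation rather than the authors' method.]

```python
import numpy as np
import librosa

sr = 22050
t = np.arange(0, 1.0, 1 / sr)

# Synthetic stand-in for a phonation: 500 Hz tone with 3 Hz vibrato
inst_f0 = 500 + 20 * np.sin(2 * np.pi * 3 * t)       # instantaneous F0 (Hz)
y = np.sin(2 * np.pi * np.cumsum(inst_f0) / sr).astype(np.float32)

# pYIN F0 track; the frequency bounds are assumptions for this illustration
f0, voiced_flag, voiced_prob = librosa.pyin(y, fmin=100, fmax=2000, sr=sr)

voiced_f0 = f0[~np.isnan(f0)]                        # keep voiced frames only
sdf0 = float(np.std(voiced_f0))                      # SDF0
print(f"SDF0: {sdf0:.1f} Hz over {voiced_f0.size} voiced frames")
```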

CONCLUSION: This study developed a rabbit model with vocal fold hyperadduction. The subglottal pressure and acoustic analysis of this model resembled the characteristics of patients with AdSD. This model could be helpful for elucidating the pathology of the larynx caused by hyperadduction and for evaluating and comparing treatments for strained phonation.}, } @article {pmid35086866, year = {2022}, author = {Heeringa, AN and Köppl, C}, title = {Auditory Nerve Fiber Discrimination and Representation of Naturally-Spoken Vowels in Noise.}, journal = {eNeuro}, volume = {9}, number = {1}, pages = {}, pmid = {35086866}, issn = {2373-2822}, mesh = {Auditory Perception/physiology ; Cochlear Nerve/physiology ; Nerve Fibers/physiology ; *Noise ; Phonetics ; Speech ; *Speech Perception/physiology ; }, abstract = {To understand how vowels are encoded by auditory nerve (AN) fibers, a number of representation schemes have been suggested that extract the vowel's formant frequencies from AN-fiber spiking patterns. The current study aims to apply and compare these schemes for AN-fiber responses to naturally-spoken vowels in a speech-shaped background noise. Responses to three vowels were evaluated; based on behavioral experiments in the same species, two of these were perceptually difficult to discriminate from each other (/e/ vs /i/), and one was perceptually easy to discriminate from the other two (/a:/). Single-unit AN fibers were recorded from ketamine/xylazine-anesthetized Mongolian gerbils of either sex (n = 8). First, single-unit discrimination between the three vowels was studied. Compared with the perceptually easy discriminations, the average spike timing-based discrimination values were significantly lower for the perceptually difficult vowel discrimination. This was not true for an average rate-based discrimination metric, the rate d-prime (d'). Consistently, spike timing-based representation schemes, plotting the temporal responses of all recorded units as a function of their best frequency (BF), i.e., dominant component schemes, average localized interval rate, and fluctuation profiles, revealed representation of the vowel's formant frequencies, whereas no such representation was apparent in the rate-based excitation pattern. Making use of perceptual discrimination data, this study reveals that discrimination difficulties of naturally-spoken vowels in speech-shaped noise originate peripherally and can be studied in the spike timing patterns of single AN fibers.}, } @article {pmid35077652, year = {2022}, author = {Yüksel, M}, title = {Reliability and Efficiency of Pitch-Shifting Plug-Ins in Voice and Hearing Research.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {3}, pages = {878-889}, doi = {10.1044/2021_JSLHR-21-00440}, pmid = {35077652}, issn = {1558-9102}, mesh = {Feedback, Sensory ; Female ; Hearing ; Humans ; Male ; *Music ; Pitch Perception ; Reproducibility of Results ; *Voice ; }, abstract = {PURPOSE: Auditory feedback perturbation with voice pitch manipulation has been widely used in previous studies. There are several hardware and software tools for such manipulations, but audio plug-ins developed for music, movies, and radio applications that operate in digital audio workstations may be extremely beneficial and are easy to use, accessible, and cost effective. However, it is unknown whether these plug-ins can perform similarly to tools that have been described in previous literature.
Hence, this study aimed to evaluate the reliability and efficiency of these plug-ins.

METHOD: Six different plug-ins were used at +1 and -1 semitone (st) pitch-shift settings, with formant correction on and off, to pitch shift sustained /ɑ/ voice recordings from 12 healthy participants (six cisgender males and six cisgender females). Pitch-shifting accuracy, formant shifting amount, intensity changes, and total latency values were reported.
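[Editor's note: pitch-shifting accuracy in cents follows directly from the ratio of measured to target frequency; the short Python sketch below shows the arithmetic with invented example values.]

```python
import numpy as np

def cents_error(f_measured, f_target):
    """Deviation of a measured F0 from its target, in cents (100 cents = 1 st)."""
    return 1200 * np.log2(f_measured / f_target)

f0_original = 220.0                       # hypothetical sustained /ɑ/ at 220 Hz
f0_target = f0_original * 2 ** (1 / 12)   # +1 st target (~233.08 Hz)
f0_measured = 233.2                       # hypothetical plug-in output

print(f"target: {f0_target:.2f} Hz, "
      f"error: {cents_error(f0_measured, f0_target):+.1f} cents")
```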

RESULTS: Some variability was observed between different plug-ins and pitch shift settings. One plug-in performed comparably to well-known hardware and software units in all four measured aspects, with 1-cent pitch-shifting accuracy, low latency values, negligible intensity difference, and preserved formants. Other plug-ins performed similarly in some respects.

CONCLUSIONS: Audio plug-ins may be used effectively in pitch-shifting applications. Researchers and clinicians can access these plug-ins easily and test whether the features also fit their aims.}, } @article {pmid35071434, year = {2021}, author = {Cao, S and Xia, M and Zhou, R and Wang, J and Jin, CY and Pei, B and Zhou, ZK and Qian, YM and Jiang, H}, title = {Voice parameters for difficult mask ventilation evaluation: an observational study.}, journal = {Annals of translational medicine}, volume = {9}, number = {23}, pages = {1740}, pmid = {35071434}, issn = {2305-5839}, abstract = {BACKGROUND: Mask ventilation (MV) is an essential component of airway management. Difficult mask ventilation (DMV) is a major cause for perioperative hypoxic brain injury; however, predicting DMV remains a challenge. This study aimed to determine the potential value of voice parameters as novel predictors of DMV in patients scheduled for general anesthesia.

METHODS: We included 1,160 adult patients scheduled for elective surgery under general anesthesia. The clinical variables usually reported as predictors of DMV were collected before surgery. Voice samples of the phonemes ([a], [o], [e], [i], [u], [ü], [ci], [qi], [chi], [le], [ke], and [en]) were recorded and their formants (f1-f4) and bandwidths (bw1-bw4) were extracted. DMV was defined as the inability of an unassisted anesthesiologist to ensure adequate ventilation during MV under general anesthesia. Univariate and multivariate logistic regression analyses were used to explore the association between voice parameters and DMV. The predictive value of the voice parameters was evaluated by assessing the area under the curve (AUC) of receiver operating characteristic (ROC) curves of a stepwise forward model.
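[Editor's note: the modeling step described above pairs logistic regression with ROC analysis; the scikit-learn sketch below reproduces that generic workflow on simulated stand-in data. The feature columns and coefficients are invented, and this is not the study's model.]

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(3)
n = 1160                                    # cohort size from the abstract
X = rng.normal(size=(n, 5))                 # stand-ins for formant/bandwidth predictors
logit = X @ np.array([0.8, -0.5, 0.3, 0.0, 0.6]) - 1.5
y = rng.random(n) < 1 / (1 + np.exp(-logit))   # simulated binary DMV outcome

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
model = LogisticRegression().fit(X_tr, y_tr)

auc = roc_auc_score(y_te, model.predict_proba(X_te)[:, 1])
print(f"AUC: {auc:.3f}")                    # the paper reports 0.779 for its model
```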

RESULTS: The prevalence of DMV was 218/1,160 (18.8%). The AUC of the stepwise forward model (including o_f4, e_bw2, i_f3, u_pitch, u_f1, u_f4, ü_bw4, ci_f1, qi_f1, qi_f4, qi_bw4, chi_f1, chi_bw2, chi_bw4, le_pitch, le_bw3, ke_bw2, en_pitch, en_f2, and en_bw4) attained a value of 0.779. The sensitivity and specificity of the model were 75.0% and 71.0%, respectively.

CONCLUSIONS: Voice parameters may be considered as alternative predictors of DMV, but additional studies are needed to confirm the initial findings.}, } @article {pmid35069371, year = {2021}, author = {Lee, A and Ng, E}, title = {Hong Kong Women Project a Larger Body When Speaking to Attractive Men.}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {786507}, pmid = {35069371}, issn = {1664-1078}, abstract = {In this pilot study we investigated the vocal strategies of Cantonese women when addressing an attractive vs. unattractive male. We recruited 19 young female native speakers of Hong Kong Cantonese who completed an attractiveness rating task, followed by a speech production task where they were presented a subset of the same faces. By comparing the rating results and corresponding acoustic data of the facial stimuli, we found that when young Cantonese women spoke to an attractive male, they were less breathy, lower in fundamental frequency, and with denser formants, all of which are considered to project a larger body. Participants who were more satisfied with their own height used these vocal strategies more actively. These results are discussed in terms of the body size projection principle.}, } @article {pmid35062025, year = {2022}, author = {Suess, N and Hauswald, A and Reisinger, P and Rösch, S and Keitel, A and Weisz, N}, title = {Cortical tracking of formant modulations derived from silently presented lip movements and its decline with age.}, journal = {Cerebral cortex (New York, N.Y. : 1991)}, volume = {32}, number = {21}, pages = {4818-4833}, pmid = {35062025}, issn = {1460-2199}, support = {MR/W02912X/1/MRC_/Medical Research Council/United Kingdom ; P 31230/FWF_/Austrian Science Fund FWF/Austria ; P 34237/FWF_/Austrian Science Fund FWF/Austria ; }, mesh = {Humans ; *Speech Perception ; Acoustic Stimulation ; Lip ; Speech ; Movement ; }, abstract = {The integration of visual and auditory cues is crucial for successful processing of speech, especially under adverse conditions. Recent reports have shown that when participants watch muted videos of speakers, the phonological information about the acoustic speech envelope, which is associated with but independent from the speakers' lip movements, is tracked by the visual cortex. However, the speech signal also carries richer acoustic details, for example, about the fundamental frequency and the resonant frequencies, whose visuophonological transformation could aid speech processing. Here, we investigated the neural basis of the visuo-phonological transformation processes of these more fine-grained acoustic details and assessed how they change as a function of age. We recorded whole-head magnetoencephalographic (MEG) data while the participants watched silent normal (i.e., natural) and reversed videos of a speaker and paid attention to their lip movements. We found that the visual cortex is able to track the unheard natural modulations of resonant frequencies (or formants) and the pitch (or fundamental frequency) linked to lip movements. Importantly, only the processing of natural unheard formants decreases significantly with age in the visual and also in the cingulate cortex. This is not the case for the processing of the unheard speech envelope, the fundamental frequency, or the purely visual information carried by lip movements. These results show that unheard spectral fine details (along with the unheard acoustic envelope) are transformed from a mere visual to a phonological representation. 
Aging especially affects the ability to derive spectral dynamics at formant frequencies. As listening in noisy environments should capitalize on the ability to track spectral fine details, our results provide a novel focus on compensatory processes in such challenging situations.}, } @article {pmid35038295, year = {2022}, author = {Almaghrabi, SA and Thewlis, D and Thwaites, S and Rogasch, NC and Lau, S and Clark, SR and Baumert, M}, title = {The Reproducibility of Bio-Acoustic Features is Associated With Sample Duration, Speech Task, and Gender.}, journal = {IEEE transactions on neural systems and rehabilitation engineering : a publication of the IEEE Engineering in Medicine and Biology Society}, volume = {30}, number = {}, pages = {167-175}, doi = {10.1109/TNSRE.2022.3143117}, pmid = {35038295}, issn = {1558-0210}, mesh = {Acoustics ; Adult ; Female ; Humans ; Male ; Reproducibility of Results ; *Speech ; Speech Acoustics ; *Voice ; }, abstract = {Bio-acoustic properties of speech show evolving value in analyzing psychiatric illnesses. Obtaining a sufficient speech sample length to quantify these properties is essential, but the impact of sample duration on the stability of bio-acoustic features has not been systematically explored. We aimed to evaluate bio-acoustic features' reproducibility against changes in speech durations and tasks. We extracted source, spectral, formant, and prosodic features in 185 English-speaking adults (98 w, 87 m) for reading-a-story and counting tasks. We compared features at 25% of the total sample duration of the reading task to those obtained from non-overlapping randomly selected sub-samples shortened to 75%, 50%, and 25% of total duration using intraclass correlation coefficients. We also compared the features extracted from entire recordings to those measured at 25% of the duration and features obtained from 50% of the duration. Further, we compared features extracted from reading-a-story to counting tasks. Our results show that the number of reproducible features (out of 125) decreased stepwise with duration reduction. Spectral shape, pitch, and formants reached excellent reproducibility. Mel-frequency cepstral coefficients (MFCCs), loudness, and zero-crossing rate achieved excellent reproducibility only at a longer duration. Reproducibility of source, MFCC derivatives, and voicing probability (VP) was poor. Significant gender differences existed in jitter, MFCC first-derivative, spectral skewness, pitch, VP, and formants. Around 97% of features in both genders were not reproducible across speech tasks, in part due to the short counting task duration. In conclusion, bio-acoustic features are less reproducible in shorter samples and are affected by gender.}, } @article {pmid35005711, year = {2021}, author = {Gaines, JL and Kim, KS and Parrell, B and Ramanarayanan, V and Nagarajan, SS and Houde, JF}, title = {Discrete constriction locations describe a comprehensive range of vocal tract shapes in the Maeda model.}, journal = {JASA express letters}, volume = {1}, number = {12}, pages = {124402}, pmid = {35005711}, issn = {2691-1191}, support = {F32 DC019538/DC/NIDCD NIH HHS/United States ; R01 DC013979/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; }, abstract = {The Maeda model was used to generate a large set of vocoid-producing vocal tract configurations.
The resulting dataset (a) produced a comprehensive range of formant frequencies and (b) displayed discrete tongue body constriction locations (palatal, velar/uvular, and lower pharyngeal). The discrete parameterization of constriction location across the vowel space suggests this is likely a fundamental characteristic of the human vocal tract, and not limited to any specific set of vowel contrasts. These findings suggest that in addition to established articulatory-acoustic constraints, fundamental biomechanical constraints of the vocal tract may also explain such discreteness.}, } @article {pmid34987356, year = {2021}, author = {Cheng, FY and Xu, C and Gold, L and Smith, S}, title = {Rapid Enhancement of Subcortical Neural Responses to Sine-Wave Speech.}, journal = {Frontiers in neuroscience}, volume = {15}, number = {}, pages = {747303}, pmid = {34987356}, issn = {1662-4548}, support = {K01 DC017192/DC/NIDCD NIH HHS/United States ; }, abstract = {The efferent auditory nervous system may be a potent force in shaping how the brain responds to behaviorally significant sounds. Previous human experiments using the frequency following response (FFR) have shown efferent-induced modulation of subcortical auditory function online and over short- and long-term time scales; however, a contemporary understanding of FFR generation presents new questions about whether previous effects were constrained solely to the auditory subcortex. The present experiment used sine-wave speech (SWS), an acoustically-sparse stimulus in which dynamic pure tones represent speech formant contours, to evoke FFRSWS. Due to the higher stimulus frequencies used in SWS, this approach biased neural responses toward brainstem generators and allowed for three stimuli (/bɔ/, /bu/, and /bo/) to be used to evoke FFRSWS before and after listeners in a training group were made aware that they were hearing a degraded speech stimulus. All SWS stimuli were rapidly perceived as speech when presented with a SWS carrier phrase, and average token identification reached ceiling performance during a perceptual training phase. Compared to a control group which remained naïve throughout the experiment, training group FFRSWS amplitudes were enhanced post-training for each stimulus. Further, linear support vector machine classification of training group FFRSWS significantly improved post-training compared to the control group, indicating that training-induced neural enhancements were sufficient to bolster machine learning classification accuracy. These results suggest that the efferent auditory system may rapidly modulate auditory brainstem representation of sounds depending on their context and perception as non-speech or speech.}, } @article {pmid34975607, year = {2021}, author = {Meykadeh, A and Golfam, A and Nasrabadi, AM and Ameri, H and Sommer, W}, title = {First Event-Related Potentials Evidence of Auditory Morphosyntactic Processing in a Subject-Object-Verb Nominative-Accusative Language (Farsi).}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {698165}, pmid = {34975607}, issn = {1664-1078}, abstract = {While most studies on neural signals of online language processing have focused on a few (usually Western) subject-verb-object (SVO) languages, corresponding knowledge on subject-object-verb (SOV) languages is scarce. Here we studied Farsi, a language with canonical SOV word order.
Because we were interested in the consequences of second-language acquisition, we compared monolingual native Farsi speakers and equally proficient bilinguals who had learned Farsi only after entering primary school. We analyzed event-related potentials (ERPs) to correct and morphosyntactically incorrect sentence-final syllables in a sentence correctness judgment task. Incorrect syllables elicited a late posterior positivity at 500-700 ms after the final syllable, resembling the P600 component, as previously observed for syntactic violations at sentence-middle positions in SVO languages. There was no sign of a left anterior negativity (LAN) preceding the P600. Additionally, we provide evidence for a real-time discrimination of phonological categories associated with morphosyntactic manipulations (between 35 and 135 ms), manifesting the instantaneous neural response to unexpected perturbations. The L2 Farsi speakers were indistinguishable from L1 speakers in terms of performance and neural signals of syntactic violations, indicating that exposure to a second language at school entry may result in native-like performance and neural correlates. In nonnative (but not native) speakers, verbal working memory capacity correlated with the late posterior positivity and performance accuracy. Hence, this first ERP study of morphosyntactic violations in a spoken SOV nominative-accusative language demonstrates ERP effects in response to morphosyntactic violations and the involvement of executive functions in non-native speakers in computations of subject-verb agreement.}, } @article {pmid34966297, year = {2021}, author = {Yamada, Y and Shinkawa, K and Nemoto, M and Arai, T}, title = {Automatic Assessment of Loneliness in Older Adults Using Speech Analysis on Responses to Daily Life Questions.}, journal = {Frontiers in psychiatry}, volume = {12}, number = {}, pages = {712251}, pmid = {34966297}, issn = {1664-0640}, abstract = {Loneliness is a perceived state of social and emotional isolation that has been associated with a wide range of adverse health effects in older adults. Automatically assessing loneliness by passively monitoring daily behaviors could potentially contribute to early detection and intervention for mitigating loneliness. Speech data has been successfully used for inferring changes in emotional states and mental health conditions, but its association with loneliness in older adults remains unexplored. In this study, we developed a tablet-based application and collected speech responses of 57 older adults to daily life questions regarding, for example, one's feelings and future travel plans. From audio data of these speech responses, we automatically extracted speech features characterizing acoustic, prosodic, and linguistic aspects, and investigated their associations with self-rated scores of the UCLA Loneliness Scale. Consequently, we found that with increasing loneliness scores, speech responses tended to have fewer inflections, longer pauses, reduced second formant frequencies, reduced variances of the speech spectrum, more filler words, and fewer positive words. The cross-validation results showed that regression and binary-classification models using speech features could estimate loneliness scores with an R² of 0.57 and detect individuals with high loneliness scores with 95.6% accuracy, respectively.
Our study provides the first empirical results suggesting the possibility of using speech data that can be collected in everyday life for the automatic assessment of loneliness in older adults, which could help develop monitoring technologies for early detection and intervention for mitigating loneliness.}, } @article {pmid34963204, year = {2021}, author = {Hussain, Q and Kochetov, A}, title = {Acoustic classification of coronal stops of Eastern Punjabi.}, journal = {Phonetica}, volume = {79}, number = {1}, pages = {77-110}, doi = {10.1515/phon-2021-2015}, pmid = {34963204}, issn = {1423-0321}, mesh = {Acoustics ; Humans ; *Language ; Phonetics ; *Speech Acoustics ; Voice Quality ; }, abstract = {Punjabi is an Indo-Aryan language which contrasts a rich set of coronal stops at dental and retroflex places of articulation across three laryngeal configurations. Moreover, all these stops occur contrastively in various positions (word-initially, -medially, and -finally). The goal of this study is to investigate how various coronal place and laryngeal contrasts are distinguished acoustically both within and across word positions. A number of temporal and spectral correlates were examined in data from 13 speakers of Eastern Punjabi: Voice Onset Time, release and closure durations, fundamental frequency, F1-F3 formants, spectral center of gravity and standard deviation, H1*-H2*, and cepstral peak prominence. The findings indicated that higher formants and spectral measures were most important for the classification of place contrasts across word positions, whereas laryngeal contrasts were reliably distinguished by durational and voice quality measures. Word-medially and -finally, F2 and F3 of the preceding vowels played a key role in distinguishing the dental and retroflex stops, while spectral noise measures were more important word-initially. The findings of this study contribute to a better understanding of factors involved in the maintenance of typologically rare and phonetically complex sets of place and laryngeal contrasts in the coronal stops of Indo-Aryan languages.}, } @article {pmid34924928, year = {2021}, author = {Zheng, Z and Li, K and Feng, G and Guo, Y and Li, Y and Xiao, L and Liu, C and He, S and Zhang, Z and Qian, D and Feng, Y}, title = {Relative Weights of Temporal Envelope Cues in Different Frequency Regions for Mandarin Vowel, Consonant, and Lexical Tone Recognition.}, journal = {Frontiers in neuroscience}, volume = {15}, number = {}, pages = {744959}, pmid = {34924928}, issn = {1662-4548}, abstract = {Objectives: Mandarin-speaking users of cochlear implants (CI) perform more poorly than their English-speaking counterparts. This may be because present CI speech coding schemes are largely based on English. This study aims to evaluate the relative contributions of temporal envelope (E) cues to Mandarin phoneme (including vowel and consonant) and lexical tone recognition to provide information for speech coding schemes specific to Mandarin. Design: Eleven normal hearing subjects were studied using acoustic temporal E cues that were extracted from 30 continuous frequency bands between 80 and 7,562 Hz using the Hilbert transform and divided into five frequency regions. Percent-correct recognition scores were obtained with acoustic E cues presented in three, four, and five frequency regions and their relative weights calculated using the least-square approach.
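[Editor's note: the temporal envelope (E) extraction named in the Design above is conventionally a band-pass filter followed by the Hilbert transform; the minimal Python sketch below shows the operation for one assumed band (Region 1, 80-502 Hz) rather than the full 30-band processing chain.]

```python
import numpy as np
from scipy.signal import butter, sosfiltfilt, hilbert

fs = 16000                                   # assumed audio sampling rate (Hz)
rng = np.random.default_rng(4)
speech = rng.normal(size=fs // 2)            # 500 ms noise stand-in for speech

# Band-limit to one frequency region (Region 1 of the abstract: 80-502 Hz)
sos = butter(4, [80, 502], btype="bandpass", fs=fs, output="sos")
band = sosfiltfilt(sos, speech)

# Temporal envelope (E cue): magnitude of the analytic signal
envelope = np.abs(hilbert(band))
print(f"envelope mean: {envelope.mean():.4f}")
```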
Results: For stimuli with three, four, and five frequency regions, percent-correct scores for vowel recognition using E cues were 50.43-84.82%, 76.27-95.24%, and 96.58%, respectively; for consonant recognition 35.49-63.77%, 67.75-78.87%, and 87.87%; for lexical tone recognition 60.80-97.15%, 73.16-96.87%, and 96.73%. For frequency region 1 to frequency region 5, the mean weights in vowel recognition were 0.17, 0.31, 0.22, 0.18, and 0.12, respectively; in consonant recognition 0.10, 0.16, 0.18, 0.23, and 0.33; in lexical tone recognition 0.38, 0.18, 0.14, 0.16, and 0.14. Conclusion: The region that contributed most to vowel recognition was Region 2 (502-1,022 Hz), which contains first formant (F1) information; Region 5 (3,856-7,562 Hz) contributed most to consonant recognition; and Region 1 (80-502 Hz), which contains fundamental frequency (F0) information, contributed most to lexical tone recognition.}, } @article {pmid34889651, year = {2022}, author = {Polka, L and Masapollo, M and Ménard, L}, title = {Setting the Stage for Speech Production: Infants Prefer Listening to Speech Sounds With Infant Vocal Resonances.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {1}, pages = {109-120}, doi = {10.1044/2021_JSLHR-21-00412}, pmid = {34889651}, issn = {1558-9102}, mesh = {Adult ; Auditory Perception ; Humans ; Infant ; Phonetics ; Speech ; *Speech Perception ; *Voice ; }, abstract = {PURPOSE: Current models of speech development argue for an early link between speech production and perception in infants. Recent data show that young infants (at 4-6 months) preferentially attend to speech sounds (vowels) with infant vocal properties compared to those with adult vocal properties, suggesting the presence of special "memory banks" for one's own nascent speech-like productions. This study investigated whether the vocal resonances (formants) of the infant vocal tract are sufficient to elicit this preference and whether this perceptual bias changes with age and emerging vocal production skills.

METHOD: We selectively manipulated the fundamental frequency (f0) of vowels synthesized with formants specifying either an infant or adult vocal tract, and then tested the effects of those manipulations on the listening preferences of infants who were slightly older than those previously tested (at 6-8 months).
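Selective f0 shifting of this kind is typically done with PSOLA-style resynthesis. A hedged sketch using Praat's manipulation pipeline through the parselmouth library; the input file and the 1.5x factor are illustrative stand-ins, not the study's stimulus parameters:

```python
# Hedged sketch of selective f0 manipulation via Praat's manipulation and
# overlap-add resynthesis, called through parselmouth. "vowel.wav" is a
# hypothetical input token.
import parselmouth
from parselmouth.praat import call

snd = parselmouth.Sound("vowel.wav")
manip = call(snd, "To Manipulation...", 0.01, 75, 600)  # time step, pitch floor, pitch ceiling (Hz)
tier = call(manip, "Extract pitch tier")
call(tier, "Multiply frequencies...", snd.xmin, snd.xmax, 1.5)  # raise f0 by 50%
call([tier, manip], "Replace pitch tier")
shifted = call(manip, "Get resynthesis (overlap-add)")
shifted.save("vowel_f0_up.wav", "WAV")
```

Because only the pitch tier is altered, the formant pattern (and hence the specified vocal tract) is left essentially intact, which is the point of such a manipulation.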

RESULTS: Unlike findings with younger infants (at 4-6 months), slightly older infants in Experiment 1 displayed a robust preference for vowels with infant formants over adult formants when f0 was matched. The strength of this preference was also positively correlated with age among infants between 4 and 8 months. In Experiment 2, this preference favoring infant over adult formants was maintained when f0 values were modulated.

CONCLUSIONS: Infants between 6 and 8 months of age displayed a robust and distinct preference for speech with resonances specifying a vocal tract that is similar in size and length to their own. This finding, together with data indicating that this preference is not present in younger infants and appears to increase with age, suggests that nascent knowledge of the motor schema of the vocal tract may play a role in shaping this perceptual bias, lending support to current models of speech development.

SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.17131805.}, } @article {pmid34860148, year = {2023}, author = {Sundberg, J and Lindblom, B and Hefele, AM}, title = {Voice source, formant frequencies and vocal tract shape in overtone singing. A case study.}, journal = {Logopedics, phoniatrics, vocology}, volume = {48}, number = {2}, pages = {75-87}, doi = {10.1080/14015439.2021.1998607}, pmid = {34860148}, issn = {1651-2022}, mesh = {Humans ; *Voice ; Phonation ; *Singing ; Voice Quality ; Tongue ; }, abstract = {Purpose: In overtone singing a singer produces two pitches simultaneously, a low-pitched, continuous drone plus a melody played on the higher, flutelike and strongly enhanced overtones of the drone. The purpose of this study was to analyse underlying acoustical, phonatory and articulatory phenomena. Methods: The voice source was analyzed by inverse filtering the sound, the articulation from a dynamic MRI video of the vocal tract profile, and the lip opening from a frontal-view video recording. Vocal tract cross-distances were measured in the MR recording and converted to area functions, the formant frequencies of which were computed. Results: Inverse filtering revealed that the overtone enhancement resulted from a close clustering of formants 2 and 3. The MRI material showed that for low enhanced overtone frequencies (FE) the tongue tip was raised and strongly retracted, while for high FE the tongue tip was less retracted but forming a longer constriction. Thus, the tongue configuration changed from an apical/anterior to a dorsal/posterior articulation. The formant frequencies derived from the area functions matched almost perfectly those used for the inverse filtering. Further, analyses of the area functions revealed that the second formant frequency was strongly dependent on the back cavity, and the third on the front cavity, which acted like a Helmholtz resonator, tuned by the tongue tip position and lip opening. Conclusions: This type of overtone singing can be fully explained by the well-established source-filter theory of voice production, as recently found by Bergevin et al. [1] for another type of overtone singing.}, } @article {pmid34852626, year = {2021}, author = {Roberts, B and Summers, RJ and Bailey, PJ}, title = {Mandatory dichotic integration of second-formant information: Contralateral sine bleats have predictable effects on consonant place judgments.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {5}, pages = {3693}, doi = {10.1121/10.0007132}, pmid = {34852626}, issn = {1520-8524}, mesh = {Acoustic Stimulation ; Judgment ; Phonetics ; Speech Acoustics ; *Speech Intelligibility ; *Speech Perception ; }, abstract = {Speech-on-speech informational masking arises because the interferer disrupts target processing (e.g., capacity limitations) or corrupts it (e.g., intrusions into the target percept); the latter should produce predictable errors. Listeners identified the consonant in monaural buzz-excited three-formant analogues of approximant-vowel syllables, forming a place of articulation series (/w/-/l/-/j/). There were two 11-member series; the vowel was either high-front or low-back. Series members shared formant-amplitude contours, fundamental frequency, and F1+F3 frequency contours; they were distinguished solely by the F2 frequency contour before the steady portion. Targets were always presented in the left ear.
For each series, F2 frequency and amplitude contours were also used to generate interferers with altered source properties-sine-wave analogues of F2 (sine bleats) matched to their buzz-excited counterparts. Accompanying each series member with a fixed mismatched sine bleat in the contralateral ear produced systematic and predictable effects on category judgments; these effects were usually largest for bleats involving the fastest rate or greatest extent of frequency change. Judgments of isolated sine bleats using the three place labels were often unsystematic or arbitrary. These results indicate that informational masking by interferers involved corruption of target processing as a result of mandatory dichotic integration of F2 information, despite the grouping cues disfavoring this integration.}, } @article {pmid34852620, year = {2021}, author = {Lodermeyer, A and Bagheri, E and Kniesburges, S and Näger, C and Probst, J and Döllinger, M and Becker, S}, title = {The mechanisms of harmonic sound generation during phonation: A multi-modal measurement-based approach.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {5}, pages = {3485}, doi = {10.1121/10.0006974}, pmid = {34852620}, issn = {1520-8524}, mesh = {Glottis/diagnostic imaging ; Humans ; *Larynx ; *Phonation ; Sound ; Vocal Cords/diagnostic imaging ; }, abstract = {Sound generation during voiced speech remains an open research topic because the underlying process within the human larynx is hardly accessible for direct measurements. In the present study, harmonic sound generation during phonation was investigated with a model that replicates the fully coupled fluid-structure-acoustic interaction (FSAI). The FSAI was captured using a multi-modal approach by measuring the flow and acoustic source fields based on particle image velocimetry, as well as the surface velocity of the vocal folds based on laser vibrometry and high-speed imaging. Strong harmonic sources were localized near the glottis, as well as further downstream, during the presence of the supraglottal jet. The strongest harmonic content of the vocal fold surface motion was verified for the area near the glottis, which directly interacts with the glottal jet flow. Also, the acoustic back-coupling of the formant frequencies onto the harmonic oscillation of the vocal folds was verified. These findings verify that harmonic sound generation is the result of a strong interrelation between the vocal fold motion, modulated flow field, and vocal tract geometry.}, } @article {pmid34852594, year = {2021}, author = {Barreda, S and Assmann, PF}, title = {Perception of gender in children's voices.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {5}, pages = {3949}, doi = {10.1121/10.0006785}, pmid = {34852594}, issn = {1520-8524}, mesh = {Acoustics ; Adult ; Child ; Cues ; Female ; Humans ; Male ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {To investigate the perception of gender from children's voices, adult listeners were presented with /hVd/ syllables, in isolation and in sentence context, produced by children between 5 and 18 years. Half the listeners were informed of the age of the talker during trials, while the other half were not. 
Correct gender identifications increased with talker age; however, performance was above chance even for age groups where the cues most often associated with gender differentiation (i.e., average fundamental frequency and formant frequencies) were not consistently different between boys and girls. The results of acoustic models suggest that cues were used in an age-dependent manner, whether listeners were explicitly told the age of the talker or not. Overall, results are consistent with the hypothesis that talker age and gender are estimated jointly in the process of speech perception. Furthermore, results show that the gender of individual talkers can be identified accurately well before reliable anatomical differences arise in the vocal tracts of females and males. In general, results support the notion that the transmission of gender information from voice depends substantially on gender-dependent patterns of articulation, rather than following deterministically from anatomical differences between male and female talkers.}, } @article {pmid34847585, year = {2021}, author = {Wilson, RH and Scherer, NJ}, title = {Waveform Amplitude and Temporal Symmetric/Asymmetric Characteristics of Phoneme and Syllable Segments in the W-1 Spondaic Words Recorded by Four Speakers.}, journal = {Journal of the American Academy of Audiology}, volume = {32}, number = {7}, pages = {445-463}, doi = {10.1055/s-0041-1730959}, pmid = {34847585}, issn = {2157-3107}, mesh = {Data Collection ; Female ; Humans ; Male ; *Phonetics ; *Speech ; }, abstract = {BACKGROUND: The amplitude and temporal asymmetry of the speech waveform are mostly associated with voiced speech utterances and are obvious in recent graphic depictions in the literature. The asymmetries are attributed to the presence and interactions of the major formants characteristic of voicing with possible contributions from the unidirectional air flow that accompanies speaking.

PURPOSE: This study investigated the amplitude symmetry/asymmetry characteristics (polarity) of speech waveforms, which to our knowledge have not been quantified.

STUDY SAMPLE: Thirty-six spondaic words spoken by two male and two female speakers were selected because they were multisyllabic words providing a reasonable sampling of speech sounds, and because four recordings unrelated to the topic under study were available.

RESEARCH DESIGN: Collectively, the words were segmented into phonemes (vowels [130], diphthongs [77], voiced consonants [258], voiceless consonants [219]), syllables (82), and blends (6). For each segment the following were analyzed separately for the positive and negative datum points: peak amplitude, the percent of the total segment datum points, the root-mean-square (rms) amplitude, and the crest factor.

DATA COLLECTION AND ANALYSES: The digitized words (44,100 samples/s; 16-bit) were parsed into 144 files (36 words × 4 speakers), edited, transcribed to numeric values (±1), and stored in a spreadsheet in which all analyses were performed with in-house routines. Overall, approximately 85% of each waveform was analyzed, which excluded portions of silent intervals, transitions, and diminished waveform endings.
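The per-segment polarity measures described above reduce to a few array operations. A small sketch, with a synthetic segment standing in for the parsed word files:

```python
# Sketch of the per-segment polarity measures: split the positive and
# negative datum points, then compute their share of samples, peak and rms
# amplitudes, and crest factors. A toy segment stands in for the word files.
import numpy as np

def polarity_stats(seg):
    stats = {}
    for name, part in (("pos", seg[seg > 0]), ("neg", seg[seg < 0])):
        rms = np.sqrt(np.mean(part ** 2))
        peak = np.max(np.abs(part))
        stats[name] = {
            "percent_points": 100 * part.size / seg.size,
            "peak_db": 20 * np.log10(peak),
            "rms_db": 20 * np.log10(rms),
            "crest_factor_db": 20 * np.log10(peak / rms),
        }
    return stats

rng = np.random.default_rng(0)
segment = rng.normal(0, 0.1, 44100)   # stand-in for one vowel segment on the ±1 scale
print(polarity_stats(segment))
```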

RESULTS: The vowel, diphthong, and syllable segments had durations (180-220 ms) that were about twice as long as the consonant durations (∼90 ms) and peak and rms amplitudes that were 6 to 12 dB higher than the consonant peak and rms amplitudes. Vowel, diphthong, and syllable segments had 10% more positive datum points (55%) than negative points (45%), which suggested temporal asymmetries within the segments. With voiced consonants, the distribution of positive and negative datum points dropped to 52 and 48% and essentially was equal with the voiceless consonants (50.3 and 49.6%). The mean rms amplitudes of the negative datum points were higher than the rms amplitudes for the positive points by 2 dB (vowels, diphthongs, and syllables), 1 dB (voiced consonants), and 0.1 dB (voiceless consonants). The 144 waveforms and segmentations are illustrated in the Supplementary Material along with the tabularized positive and negative segment characteristics.

CONCLUSIONS: The temporal and amplitude waveform asymmetries were by far most notable in segments that had a voicing component, which included the voiced consonants. These asymmetries were characterized by larger envelopes and more energy in the negative side of the waveform segment than in the positive side. Interestingly, these segments had more positive datum points than negative points, which indicated temporal asymmetry. All aspects of the voiceless consonants were equally divided between the positive and negative domains. There were female/male differences, but with these limited samples such differences should not be generalized beyond the speakers in this study. The influence of the temporal and amplitude asymmetries on monaural word-recognition performance is thought to be negligible.}, } @article {pmid34827803, year = {2021}, author = {Hedwig, D and Poole, J and Granli, P}, title = {Does Social Complexity Drive Vocal Complexity? Insights from the Two African Elephant Species.}, journal = {Animals : an open access journal from MDPI}, volume = {11}, number = {11}, pages = {}, pmid = {34827803}, issn = {2076-2615}, abstract = {The social complexity hypothesis (SCH) for communication states that the range and frequency of social interactions drive the evolution of complex communication systems. Surprisingly, few studies have empirically tested the SCH for vocal communication systems. Filling this gap is important because a co-evolutionary runaway process between social and vocal complexity may have shaped the most intricate communication system, human language. We here propose the African elephant Loxodonta spec. as an excellent study system to investigate the relationships between social and vocal complexity. We review how the distinct differences in social complexity between the two species of African elephants, the forest elephant L. cyclotis and the savanna elephant L. africana, relate to repertoire size and structure, as well as complex communication skills in the two species, such as call combination or intentional formant modulation including the trunk. Our findings suggest that Loxodonta may contradict the SCH, as well as other factors put forth to explain patterns of vocal complexity across species. We propose that life history traits, a factor that has gained little attention as a driver of vocal complexity, and the extensive parental care associated with a uniquely low and slow reproductive rate, may have led to the emergence of pronounced vocal complexity in the forest elephant despite their less complex social system compared to the savanna elephant. Conclusions must be drawn cautiously, however.
A better understanding of vocal complexity in the genus Loxodonta will depend on continuing advancements in remote data collection technologies to overcome the challenges of observing forest elephants in their dense rainforest habitat, as well as the availability of directly comparable data and methods, quantifying both structural and contextual variability in the production of rumbles and other vocalizations in both species of African elephants.}, } @article {pmid34809062, year = {2021}, author = {Du, X and Zhang, X and Wang, Y and Ma, G and Liu, Y and Wang, B and Mao, H}, title = {Highly sensitive detection of plant growth regulators by using terahertz time-domain spectroscopy combined with metamaterials.}, journal = {Optics express}, volume = {29}, number = {22}, pages = {36535-36545}, doi = {10.1364/OE.437909}, pmid = {34809062}, issn = {1094-4087}, mesh = {Biosensing Techniques/*methods ; Computer Simulation ; Equipment Design ; Glycylglycine/*analysis ; Hydrazines/*analysis ; Plant Growth Regulators/*analysis ; Plants/*chemistry ; Refractometry ; Sensitivity and Specificity ; Terahertz Spectroscopy/instrumentation/*methods ; }, abstract = {The rapid and sensitive detection of plant-growth-regulator (PGR) residue is essential for ensuring food safety for consumers. However, there are many disadvantages in current approaches to detecting PGR residue. In this paper, we demonstrate a highly sensitive PGR detection method by using terahertz time-domain spectroscopy combined with metamaterials. We propose a double-formant metamaterial resonator: a split-ring structure composed of a titanium-gold nanostructure on a polyimide-film substrate. The terahertz spectral response and electric field distribution of the metamaterials under different analyte thicknesses and refractive indices were also investigated. The simulation results showed that the theoretical sensitivities of resonance peak 1 and peak 2 of the refractive index sensor based on our designed metamaterial resonator approach 780 and 720 gigahertz per refractive index unit (GHz/RIU), respectively. In experiments, a rapid solution analysis platform based on the double-formant metamaterial resonator was set up, and PGR residues in aqueous solution were directly and rapidly detected through terahertz time-domain spectroscopy. The results showed that metamaterials can successfully detect butylhydrazine and N-N diglycine at a concentration as low as 0.05 mg/L. This study paves a new way for sensitive, rapid, low-cost detection of PGRs. It also means that the double-formant metamaterial resonator has significant potential for other applications in terahertz sensing.}, } @article {pmid34808474, year = {2022}, author = {Li, P and Ross, CF and Luo, ZX}, title = {Morphological disparity and evolutionary transformations in the primate hyoid apparatus.}, journal = {Journal of human evolution}, volume = {162}, number = {}, pages = {103094}, doi = {10.1016/j.jhevol.2021.103094}, pmid = {34808474}, issn = {1095-8606}, mesh = {Animals ; Female ; Haplorhini ; Hyoid Bone/anatomy & histology ; Phylogeny ; *Placenta ; Pregnancy ; *Primates/anatomy & histology ; }, abstract = {The hyoid apparatus plays an integral role in swallowing, respiration, and vocalization in mammals.
Most placental mammals have a rod-shaped basihyal connected to the basicranium via both soft tissues and a mobile bony chain (the anterior cornu), whereas anthropoid primates have broad, shield-like or even cup-shaped basihyals suspended from the basicranium by soft tissues only. How the unique anthropoid hyoid morphology evolved is unknown, and hyoid morphology of nonanthropoid primates is poorly documented. Here we use phylogenetic comparative methods and linear morphometrics to address knowledge gaps in hyoid evolution among primates and their euarchontan outgroups. We find that dermopterans have variable reduction of cornu elements. Cynocephalus volans is sexually dimorphic in hyoid morphology. Tupaia and all lemuroids except Daubentonia have a fully ossified anterior cornu connecting a rod-shaped basihyal to the basicranium; this is the ancestral mammalian pattern that is also characteristic of the last common ancestor of Primates. Haplorhines exhibit a reduced anterior cornu, and anthropoids underwent further increase in basihyal aspect ratio values and in relative basihyal volume. Convergent with haplorhines, lorisoid strepsirrhines independently evolved a broad basihyal and reduced anterior cornua. While a reduced anterior cornu is hypothesized to facilitate vocal tract lengthening and lower formant frequencies in some mammals, our results suggest vocalization adaptations alone are unlikely to drive the iterative reduction of anterior cornua within Primates. Our new data on euarchontan hyoid evolution provide an anatomical basis for further exploring the form-function relationships of the hyoid across different behaviors, including vocalization, chewing, and swallowing.}, } @article {pmid34799495, year = {2022}, author = {Xu, L and Luo, J and Xie, D and Chao, X and Wang, R and Zahorik, P and Luo, X}, title = {Reverberation Degrades Pitch Perception but Not Mandarin Tone and Vowel Recognition of Cochlear Implant Users.}, journal = {Ear and hearing}, volume = {43}, number = {4}, pages = {1139-1150}, doi = {10.1097/AUD.0000000000001173}, pmid = {34799495}, issn = {1538-4667}, mesh = {*Cochlear Implantation ; *Cochlear Implants ; *Deafness/rehabilitation ; Humans ; Pitch Perception/physiology ; *Speech Perception/physiology ; }, abstract = {OBJECTIVES: The primary goal of this study was to investigate the effects of reverberation on Mandarin tone and vowel recognition of cochlear implant (CI) users and normal-hearing (NH) listeners. To understand the performance of Mandarin tone recognition, this study also measured participants' pitch perception and the availability of temporal envelope cues in reverberation.

DESIGN: Fifteen CI users and nine NH listeners, all Mandarin speakers, were asked to recognize Mandarin single-vowels produced in four lexical tones and rank harmonic complex tones in pitch with different reverberation times (RTs) from 0 to 1 second. Virtual acoustic techniques were used to simulate rooms with different degrees of reverberation. Vowel duration and correlation between amplitude envelope and fundamental frequency (F0) contour were analyzed for different tones as a function of the RT.
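For prototyping, reverberation at a target RT can be approximated by convolving dry speech with an exponentially decaying noise impulse response. The sketch below is a generic stand-in for the study's virtual acoustic techniques, which modeled rooms in more detail:

```python
# Minimal sketch of simulating a given RT60 by convolving a dry signal with
# an exponentially decaying noise impulse response. Amplitude decays by
# 60 dB over rt60 seconds: exp(-6.91 * t / rt60), since 6.91 ~= ln(10^3).
import numpy as np
from scipy.signal import fftconvolve

def reverberate(x, fs, rt60):
    n = int(fs * rt60)
    t = np.arange(n) / fs
    rir = np.random.default_rng(1).standard_normal(n) * np.exp(-6.91 * t / rt60)
    rir /= np.max(np.abs(rir))
    y = fftconvolve(x, rir)[: len(x)]
    return y / np.max(np.abs(y))

fs = 16000
dry = np.random.default_rng(0).standard_normal(fs)  # stand-in for a vowel token
wet = reverberate(dry, fs, rt60=1.0)                # RT at the top of the studied range
```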

RESULTS: Vowel durations of different tones significantly increased with longer RTs. Amplitude-F0 correlation remained similar for the falling Tone 4 but greatly decreased for the other tones in reverberation. NH listeners had robust pitch-ranking, tone recognition, and vowel recognition performance as the RT increased. Reverberation significantly degraded CI users' pitch-ranking thresholds but did not significantly affect the overall scores of tone and vowel recognition with CIs. Detailed analyses of tone confusion matrices showed that CI users reduced the flat Tone-1 responses but increased the falling Tone-4 responses in reverberation, possibly due to the falling amplitude envelope of late reflections after the original vowel segment. CI users' tone recognition scores were not correlated with their pitch-ranking thresholds.

CONCLUSIONS: NH listeners can reliably recognize Mandarin tones in reverberation using salient pitch cues from spectral and temporal fine structures. However, CI users have poorer pitch perception using F0-related amplitude modulations that are reduced in reverberation. Reverberation distorts speech amplitude envelopes, which affect the distribution of tone responses but not the accuracy of tone recognition with CIs. Recognition of vowels with stationary formant trajectories is not affected by reverberation for both NH listeners and CI users, regardless of the available spectral resolution. Future studies should test how the relatively stable vowel and tone recognition may contribute to sentence recognition in reverberation for Mandarin-speaking CI users.}, } @article {pmid34783468, year = {2021}, author = {Kovalenko, AN and Kastyro, IV and Popadyuk, VI and Vostrikov, AM and Sheveleva, VA and Kleyman, VK and Shalamov, KP and Torshin, VI}, title = {[Dynamics of vowel acoustic space indicators in patients with long-term hearing loss].}, journal = {Vestnik otorinolaringologii}, volume = {86}, number = {5}, pages = {17-21}, doi = {10.17116/otorino20218605117}, pmid = {34783468}, issn = {0042-4668}, mesh = {Acoustics ; Adult ; *Deafness ; Female ; *Hearing Loss/diagnosis ; Humans ; Male ; Phonetics ; Russia ; Speech Acoustics ; }, abstract = {UNLABELLED: A new procedure for transforming the vowel acoustic space (VAS; vowel acoustic triangles) was developed to characterize vowel production in individuals with long-term hearing loss (HL).

OBJECTIVE: To characterize the VAS of adult Russian speakers with long-term HL using newly developed acoustic indicators.

MATERIAL AND METHODS: Recordings of the sustained Russian cardinal vowels /a/, /i/, /u/ produced by 10 women and 10 men with long-term HL were acoustically analyzed. For each participant, the first two formants of each vowel were measured and log-transformed (logF1, logF2). Each VAS was transformed into a right triangle, its /u/ corner was moved to the origin, and its legs were aligned with the axes. In the control group, which consisted of subjects without hearing impairment, the VAS was almost symmetrical, similar across subjects, and of maximum size, whereas in the long-term HL group the VAS tended to be reduced in size and stretched along one axis.
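The geometric part of this normalization (log-transform, translate the /u/ corner to the origin, align one side with an axis) can be sketched directly; the authors' exact right-triangle construction is not reproduced, so this is only an approximation with made-up formant values:

```python
# Loose sketch of a VAS normalization: log-transform the corner-vowel
# formants, translate the /u/ corner to the origin, and rotate so the
# /u/-/i/ side lies along the logF2 axis. Values are hypothetical.
import numpy as np

corners = {"a": (850, 1300), "i": (300, 2300), "u": (320, 700)}  # (F1, F2) in Hz
pts = {v: np.log10(f) for v, f in corners.items()}               # (logF1, logF2)

origin = np.array(pts["u"])
shifted = {v: np.array(p) - origin for v, p in pts.items()}      # /u/ corner at origin

theta = np.arctan2(shifted["i"][0], shifted["i"][1])             # angle of /u/-/i/ side from logF2 axis
rot = np.array([[np.cos(theta), -np.sin(theta)],
                [np.sin(theta),  np.cos(theta)]])
aligned = {v: rot @ p for v, p in shifted.items()}
print({v: p.round(3) for v, p in aligned.items()})               # /i/ now lies on the logF2 axis
```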

RESULTS: Our study showed that a new VAS normalization approach can distinguish at least three groups of people with long-term HL.

CONCLUSION: There are those with vowel triangles stretched along the logF1 axis, those with vowel triangles stretched along the logF2 axis, and those with symmetrical vowel triangles. Causes of the VAS differences require further investigation.}, } @article {pmid34776842, year = {2021}, author = {Melchor, J and Vergara, J and Figueroa, T and Morán, I and Lemus, L}, title = {Formant-Based Recognition of Words and Other Naturalistic Sounds in Rhesus Monkeys.}, journal = {Frontiers in neuroscience}, volume = {15}, number = {}, pages = {728686}, pmid = {34776842}, issn = {1662-4548}, abstract = {In social animals, identifying sounds is critical for communication. In humans, the acoustic parameters involved in speech recognition, such as the formant frequencies derived from the resonance of the supralaryngeal vocal tract, have been well documented. However, how formants contribute to recognizing learned sounds in non-human primates remains unclear. To determine this, we trained two rhesus monkeys to discriminate target and non-target sounds presented in sequences of 1-3 sounds. After training, we performed three experiments: (1) We tested the monkeys' accuracy and reaction times during the discrimination of various acoustic categories; (2) their ability to discriminate morphing sounds; and (3) their ability to identify sounds consisting of formant 1 (F1), formant 2 (F2), or F1 and F2 (F1F2) pass filters. Our results indicate that macaques can learn diverse sounds and discriminate morphs and formants F1 and F2, suggesting that information from a few acoustic parameters suffices for recognizing complex sounds. We anticipate that future neurophysiological experiments in this paradigm may help elucidate how formants contribute to the recognition of sounds.}, } @article {pmid34775826, year = {2022}, author = {Cartei, V and Reby, D and Garnham, A and Oakhill, J and Banerjee, R}, title = {Peer audience effects on children's vocal masculinity and femininity.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {377}, number = {1841}, pages = {20200397}, pmid = {34775826}, issn = {1471-2970}, mesh = {Acoustics ; Child ; Female ; *Femininity ; Humans ; Male ; Masculinity ; *Voice ; }, abstract = {Existing evidence suggests that children from around the age of 8 years strategically alter their public image in accordance with known values and preferences of peers, through the self-descriptive information they convey. However, an important but neglected aspect of this 'self-presentation' is the medium through which such information is communicated: the voice itself. The present study explored peer audience effects on children's vocal productions. Fifty-six children (26 females, aged 8-10 years) were presented with vignettes where a fictional child, matched to the participant's age and sex, is trying to make friends with a group of same-sex peers with stereotypically masculine or feminine interests (rugby and ballet, respectively). Participants were asked to impersonate the child in that situation and, as the child, to read out loud masculine, feminine and gender-neutral self-descriptive statements to these hypothetical audiences. They also had to decide which of those self-descriptive statements would be most helpful for making friends. In line with previous research, boys and girls preferentially selected masculine or feminine self-descriptive statements depending on the audience interests.
Crucially, acoustic analyses of fundamental frequency and formant frequency spacing revealed that children also spontaneously altered their vocal productions: they feminized their voices when speaking to members of the ballet club, while they masculinized their voices when speaking to members of the rugby club. Both sexes also feminized their voices when uttering feminine sentences, compared to when uttering masculine and gender-neutral sentences. Implications for the hitherto neglected role of acoustic qualities of children's vocal behaviour in peer interactions are discussed. This article is part of the theme issue 'Voice modulation: from origin and mechanism to social impact (Part II)'.}, } @article {pmid34775821, year = {2022}, author = {Pisanski, K and Anikin, A and Reby, D}, title = {Vocal size exaggeration may have contributed to the origins of vocalic complexity.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {377}, number = {1841}, pages = {20200401}, pmid = {34775821}, issn = {1471-2970}, mesh = {Acoustics ; Animals ; Body Size ; Speech ; Vocalization, Animal ; *Voice ; }, abstract = {Vocal tract elongation, which uniformly lowers vocal tract resonances (formant frequencies) in animal vocalizations, has evolved independently in several vertebrate groups as a means for vocalizers to exaggerate their apparent body size. Here, we propose that smaller speech-like articulatory movements that alter only individual formants can serve a similar yet less energetically costly size-exaggerating function. To test this, we examine whether uneven formant spacing alters the perceived body size of vocalizers in synthesized human vowels and animal calls. Among six synthetic vowel patterns, those characterized by the lowest first and second formant (the vowel /u/ as in 'boot') are consistently perceived as produced by the largest vocalizer. Crucially, lowering only one or two formants in animal-like calls also conveys the impression of a larger body size, and lowering the second and third formants simultaneously exaggerates perceived size to a similar extent as rescaling all formants. As the articulatory movements required for individual formant shifts are minor compared to full vocal tract extension, they represent a rapid and energetically efficient mechanism for acoustic size exaggeration. We suggest that, by favouring the evolution of uneven formant patterns in vocal communication, this deceptive strategy may have contributed to the origins of the phonemic diversification required for articulated speech. This article is part of the theme issue 'Voice modulation: from origin and mechanism to social impact (Part II)'.}, } @article {pmid34775819, year = {2022}, author = {Grawunder, S and Uomini, N and Samuni, L and Bortolato, T and Girard-Buttoz, C and Wittig, RM and Crockford, C}, title = {Chimpanzee vowel-like sounds and voice quality suggest formant space expansion through the hominoid lineage.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {377}, number = {1841}, pages = {20200455}, pmid = {34775819}, issn = {1471-2970}, mesh = {Acoustics ; Animals ; *Pan troglodytes/physiology ; Phonetics ; Speech Acoustics ; *Voice Quality ; }, abstract = {The origins of human speech are obscure; it is still unclear what aspects are unique to our species or shared with our evolutionary cousins, in part due to a lack of a common framework for comparison. 
We asked what chimpanzee and human vocal production acoustics have in common. We examined visible supra-laryngeal articulators of four major chimpanzee vocalizations (hoos, grunts, barks, screams) and their associated acoustic structures, using techniques from human phonetic and animal communication analysis. Data were collected from wild adult chimpanzees, Taï National Park, Ivory Coast. Both discriminant and principal component classification procedures revealed classification of call types. Discriminating acoustic features include voice quality and formant structure, mirroring phonetic features in human speech. Chimpanzee lip and jaw articulation variables also offered similar discrimination of call types. Formant maps distinguished call types with different vowel-like sounds. Comparing our results with published primate data, humans show less F1-F2 correlation and further expansion of the vowel space, particularly for [i] sounds. Unlike recent studies suggesting monkeys achieve human vowel space, we conclude from our results that supra-laryngeal articulatory capacities show moderate evolutionary change, with vowel space expansion continuing through hominoid evolution. Studies on more primate species will be required to substantiate this. This article is part of the theme issue 'Voice modulation: from origin and mechanism to social impact (Part II)'.}, } @article {pmid34756498, year = {2024}, author = {Davatz, GC and Yamasaki, R and Hachiya, A and Tsuji, DH and Montagnoli, AN}, title = {Source and Filter Acoustic Measures of Young, Middle-Aged and Elderly Adults for Application in Vowel Synthesis.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {2}, pages = {253-263}, doi = {10.1016/j.jvoice.2021.08.025}, pmid = {34756498}, issn = {1873-4588}, mesh = {Male ; Middle Aged ; Young Adult ; Humans ; Female ; Aged ; Adolescent ; Adult ; Aged, 80 and over ; *Speech Acoustics ; Prospective Studies ; Acoustics ; Sound ; *Voice ; Phonetics ; }, abstract = {INTRODUCTION: The output sound changes in important ways throughout life due to anatomical and physiological modifications in the larynx and vocal tract. Understanding the acoustic characteristics of speech from young adulthood to old age may assist in the synthesis of representative voices of men and women of different age groups.

OBJECTIVE: To obtain the fundamental frequency (f0), formant frequencies (F1, F2, F3, F4), and bandwidth (B1, B2, B3, B4) values extracted from the sustained vowel /a/ of young, middle-aged, and elderly adults who are Brazilian Portuguese speakers; to present the application of these parameters in vowel synthesis.

STUDY DESIGN: Prospective study.

METHODS: Acoustic analysis was performed on 162 tokens of the sustained vowel /a/ produced by vocally healthy men and women between 18 and 80 years old. The adults were divided into three groups: young adults (18 to 44 years old), middle-aged adults (45 to 59 years old), and elderly adults (60 to 80 years old). The f0, F1, F2, F3, F4, B1, B2, B3, and B4 were extracted from the audio signals. Their average values were applied to a source-filter mathematical model to perform vowel synthesis for each age group, for both men and women.
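The synthesis step described here is the classic source-filter recipe: an impulse train at the measured f0 drives a cascade of second-order resonators, one per formant frequency and bandwidth. A compact sketch with illustrative values, not the paper's group means:

```python
# Sketch of source-filter vowel synthesis: impulse-train source at f0,
# filtered by a cascade of second-order all-pole resonators, each defined
# by a formant frequency Fi and bandwidth Bi (Klatt-style, unity gain at DC).
import numpy as np
from scipy.signal import lfilter

def resonator(x, f, bw, fs):
    r = np.exp(-np.pi * bw / fs)              # pole radius from bandwidth
    c = 2 * r * np.cos(2 * np.pi * f / fs)    # pole angle from formant frequency
    return lfilter([1 - c + r * r], [1, -c, r * r], x)

fs, f0, dur = 16000, 120.0, 0.5
n = int(fs * dur)
source = np.zeros(n)
source[::int(fs / f0)] = 1.0                  # glottal source approximated by impulses
formants = [(700, 90), (1200, 110), (2600, 160), (3300, 200)]  # (Fi, Bi) in Hz, illustrative
y = source
for f, bw in formants:
    y = resonator(y, f, bw, fs)
y /= np.max(np.abs(y))                        # normalized synthetic /a/-like vowel
```

Swapping in a group's average f0, formant, and bandwidth values is what turns such a model into an age- and sex-representative synthetic vowel.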

RESULTS: Young women had higher f0 than middle-aged and elderly women. Elderly women had lower F1 than middle-aged women. Young women had higher F2 than elderly women. For the men's output sound, the source-filter acoustic measures were statistically equivalent among the age groups. Average values of the f0, F1, F2, F3, F4, B1, and B2 were higher in women. The spacing of the sound waves in the signals, the positions of the formant frequencies, and the widths of the bandwidths visible in the spectra of the synthesized sounds represent the average values extracted from the volunteers' productions of the sustained vowel /a/ in Brazilian Portuguese.

CONCLUSION: The sustained vowel /a/ produced by women presented different values of f0, F1, and F2 between age groups, which was not observed for men. In addition to the f0 and the formant frequencies, the bandwidths were also different between women and men. The synthetic vowels available represent the acoustic changes found for each sex as a function of age.}, } @article {pmid34735295, year = {2021}, author = {Rowe, HP and Stipancic, KL and Lammert, AC and Green, JR}, title = {Validation of an Acoustic-Based Framework of Speech Motor Control: Assessing Criterion and Construct Validity Using Kinematic and Perceptual Measures.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {12}, pages = {4736-4753}, pmid = {34735295}, issn = {1558-9102}, support = {F31 DC019556/DC/NIDCD NIH HHS/United States ; R01 DC013547/DC/NIDCD NIH HHS/United States ; R01 DC009890/DC/NIDCD NIH HHS/United States ; K24 DC016312/DC/NIDCD NIH HHS/United States ; R01 DC017291/DC/NIDCD NIH HHS/United States ; }, mesh = {Acoustics ; Biomechanical Phenomena ; Humans ; *Speech ; Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {PURPOSE: This study investigated the criterion (analytical and clinical) and construct (divergent) validity of a novel, acoustic-based framework composed of five key components of motor control: Coordination, Consistency, Speed, Precision, and Rate.

METHOD: Acoustic and kinematic analyses were performed on audio recordings from 22 subjects with amyotrophic lateral sclerosis during a sequential motion rate task. Perceptual analyses were completed by two licensed speech-language pathologists, who rated each subject's speech on the five framework components and their overall severity. Analytical and clinical validity were assessed by comparing performance on the acoustic features to their kinematic correlates and to clinician ratings of the five components, respectively. Divergent validity of the acoustic-based framework was then assessed by comparing performance on each pair of acoustic features to determine whether the features represent distinct articulatory constructs. Bivariate correlations and partial correlations with severity as a covariate were conducted for each comparison.
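Since severity is used as a covariate throughout these analyses, a worked partial correlation makes the procedure concrete: regress the covariate out of both variables and correlate the residuals. A small sketch on synthetic stand-in data:

```python
# Sketch of a partial correlation with severity as the covariate: remove the
# linear effect of severity from both variables, then correlate the residuals.
# All data here are synthetic stand-ins for the acoustic/kinematic measures.
import numpy as np

def partial_corr(x, y, z):
    """Correlation of x and y after removing the linear effect of covariate z."""
    def resid(a):
        return a - np.polyval(np.polyfit(z, a, 1), z)
    return np.corrcoef(resid(x), resid(y))[0, 1]

rng = np.random.default_rng(0)
severity = rng.normal(size=22)                               # 22 subjects, as in the study
acoustic = 0.8 * severity + rng.normal(scale=0.5, size=22)   # e.g., an F2-slope feature
kinematic = 0.7 * severity + rng.normal(scale=0.5, size=22)  # its kinematic correlate
print(np.corrcoef(acoustic, kinematic)[0, 1],                # raw (severity-inflated)
      partial_corr(acoustic, kinematic, severity))           # severity controlled
```

The drop from the raw to the partial coefficient illustrates how a strong association can weaken once severity is controlled, as reported for Speed and Precision.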

RESULTS: Results revealed moderate-to-strong analytical validity for every acoustic feature, both with and without controlling for severity, and moderate-to-strong clinical validity for all acoustic features except Coordination, without controlling for severity. When severity was included as a covariate, the strong associations for Speed and Precision became weak. Divergent validity was supported by weak-to-moderate pairwise associations between all acoustic features except Speed (second-formant [F2] slope of consonant transition) and Precision (between-consonant variability in F2 slope).

CONCLUSIONS: This study demonstrated that the acoustic-based framework has potential as an objective, valid, and clinically useful tool for profiling articulatory deficits in individuals with speech motor disorders. The findings also suggest that compared to clinician ratings, instrumental measures are more sensitive to subtle differences in articulatory function. With further research, this framework could provide more accurate and reliable characterizations of articulatory impairment, which may eventually increase clinical confidence in the diagnosis and treatment of patients with different articulatory phenotypes.}, } @article {pmid34734018, year = {2021}, author = {Xia, M and Cao, S and Zhou, R and Wang, JY and Xu, TY and Zhou, ZK and Qian, YM and Jiang, H}, title = {Acoustic features as novel predictors of difficult laryngoscopy in orthognathic surgery: an observational study.}, journal = {Annals of translational medicine}, volume = {9}, number = {18}, pages = {1466}, pmid = {34734018}, issn = {2305-5839}, abstract = {BACKGROUND: Evaluation for difficult intubation is an important step before anaesthesia, as unanticipated difficult intubation is associated with morbidity and mortality. This study aimed to determine whether acoustic features are valuable as an alternative method for predicting difficult laryngoscopy (DL) in patients scheduled to undergo orthognathic surgery.

METHODS: This study included 225 adult patients who were undergoing elective orthognathic surgery under general anaesthesia with tracheal intubation. Preoperatively, clinical airway evaluation was performed, and the acoustic data were collected. Twelve phonemes {[a], [o], [e], [i], [u], [ü], [ci], [qi], [chi], [le], [ke], and [en]} were recorded, and their formants (f1-f4) and bandwidths (bw1-bw4) were extracted. Difficult laryngoscopy was defined as direct laryngoscopy with a Cormack-Lehane grade of 3 or 4. Univariate and multivariate logistic regression analyses were used to examine the associations between acoustic features and DL.
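The modelling step (phoneme-wise formant and bandwidth features feeding a logistic regression) can be sketched as below; the feature matrix is synthetic and the backward stepwise selection is omitted, so this only mirrors the shape of the analysis:

```python
# Sketch of a DL classifier in the spirit of the study: formant/bandwidth
# features -> logistic regression -> AUC and odds ratios. The feature matrix
# and outcome are synthetic; only the prevalence (~26%) matches the report.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(225, 7))        # stand-ins for en_f2, ci_bw4, qi_bw4, le_f3, o_bw4, chi_f4, a_bw4
y = (rng.random(225) < 0.26).astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
model = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print("test AUC:", roc_auc_score(y_te, model.predict_proba(X_te)[:, 1]))
print("odds ratios:", np.exp(model.coef_).round(3))   # exponentiated coefficients
```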

RESULTS: Difficult laryngoscopy was reported in 59/225 (26.2%) patients. The area under the curve (AUC) of the backward stepwise model including en_f2 [odds ratio (OR), 0.996; 95% confidence interval (CI), 0.994-0.999; P=0.006], ci_bw4 (OR, 0.997; 95% CI, 0.993-1.000; P=0.057), qi_bw4 (OR, 0.996; 95% CI, 0.993-0.999; P=0.017), le_f3 (OR, 0.998; 95% CI, 0.996-1.000; P=0.079), o_bw4 (OR, 1.001; 95% CI, 1.000-1.003; P=0.014), chi_f4 (OR, 1.003; 95% CI, 1.000-1.005; P=0.041), and a_bw4 (OR, 0.999; 95% CI, 0.998-1.000; P=0.078) attained a value of 0.761 in the training set, but a value of 0.709 in the testing set. The sensitivity and specificity of the model in the testing set were 86.7% and 63.0%, respectively.

CONCLUSIONS: Acoustic features may be considered useful predictors of DL during orthognathic surgery.}, } @article {pmid34731577, year = {2021}, author = {Abur, D and Subaciute, A and Daliri, A and Lester-Smith, RA and Lupiani, AA and Cilento, D and Enos, NM and Weerathunge, HR and Tardif, MC and Stepp, CE}, title = {Feedback and Feedforward Auditory-Motor Processes for Voice and Articulation in Parkinson's Disease.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {12}, pages = {4682-4694}, pmid = {34731577}, issn = {1558-9102}, support = {F31 DC019032/DC/NIDCD NIH HHS/United States ; R01 DC016270/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Feedback ; Feedback, Sensory ; Humans ; *Parkinson Disease/complications ; Speech ; Speech Intelligibility/physiology ; Speech Production Measurement ; *Voice ; }, abstract = {PURPOSE: Unexpected and sustained manipulations of auditory feedback during speech production result in "reflexive" and "adaptive" responses, which can shed light on feedback and feedforward auditory-motor control processes, respectively. Persons with Parkinson's disease (PwPD) have shown aberrant reflexive and adaptive responses, but responses appear to differ for control of vocal and articulatory features. However, these responses have not been examined for both voice and articulation in the same speakers and with respect to auditory acuity and functional speech outcomes (speech intelligibility and naturalness).

METHOD: Here, 28 PwPD on their typical dopaminergic medication schedule and 28 age-, sex-, and hearing-matched controls completed tasks yielding reflexive and adaptive responses as well as auditory acuity for both vocal and articulatory features.

RESULTS: No group differences were found for any measures of auditory-motor control, conflicting with prior findings in PwPD while off medication. Auditory-motor measures were also compared with listener ratings of speech function: first formant frequency acuity was related to speech intelligibility, whereas adaptive responses to vocal fundamental frequency manipulations were related to speech naturalness.

CONCLUSIONS: These results support that auditory-motor processes for both voice and articulatory features are intact for PwPD receiving medication. This work is also the first to suggest associations between measures of auditory-motor control and speech intelligibility and naturalness.}, } @article {pmid34717445, year = {2021}, author = {Cheung, ST and Thompson, K and Chen, JL and Yunusova, Y and Beal, DS}, title = {Response patterns to vowel formant perturbations in children.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {4}, pages = {2647}, doi = {10.1121/10.0006567}, pmid = {34717445}, issn = {1520-8524}, mesh = {Adaptation, Physiological ; Adolescent ; Child ; Child, Preschool ; Feedback, Sensory ; Humans ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception ; Speech Production Measurement ; }, abstract = {Auditory feedback is an important component of speech motor control, but its precise role in developing speech is less understood. The role of auditory feedback in development was probed by perturbing the speech of children 4-9 years old. The vowel sound /ɛ/ was shifted to /æ/ in real time and presented to participants as their own auditory feedback. Analyses of the resultant formant magnitude changes in the participants' speech indicated that children compensated and adapted by adjusting their formants to oppose the perturbation. Older and younger children responded to perturbation differently in F1 and F2. The compensatory change in F1 was greater for younger children, whereas the increase in F2 was greater for older children. Adaptation aftereffects were observed in both groups. Exploratory directional analyses in the two-dimensional formant space indicated that older children responded more directly and less variably to the perturbation than younger children, shifting their vowels back toward the vowel sound /ɛ/ to oppose the perturbation. Findings support the hypothesis that auditory feedback integration continues to develop between the ages of 4 and 9 years old such that the differences in the adaptive and compensatory responses arise between younger and older children despite receiving the same auditory feedback perturbation.}, } @article {pmid34717269, year = {2021}, author = {Tang, DL and McDaniel, A and Watkins, KE}, title = {Disruption of speech motor adaptation with repetitive transcranial magnetic stimulation of the articulatory representation in primary motor cortex.}, journal = {Cortex; a journal devoted to the study of the nervous system and behavior}, volume = {145}, number = {}, pages = {115-130}, pmid = {34717269}, issn = {1973-8102}, support = {/WT_/Wellcome Trust/United Kingdom ; }, mesh = {Adaptation, Physiological ; Feedback, Sensory ; Humans ; *Motor Cortex ; *Speech ; Transcranial Magnetic Stimulation ; }, abstract = {When auditory feedback perturbation is introduced in a predictable way over a number of utterances, speakers learn to compensate by adjusting their own productions, a process known as sensorimotor adaptation. Despite multiple lines of evidence indicating the role of primary motor cortex (M1) in motor learning and memory, whether M1 causally contributes to sensorimotor adaptation in the speech domain remains unclear. Here, we aimed to assay whether temporary disruption of the articulatory representation in left M1 by repetitive transcranial magnetic stimulation (rTMS) impairs speech adaptation. 
To induce sensorimotor adaptation, the frequencies of first formants (F1) were shifted up and played back to participants when they produced "head", "bed", and "dead" repeatedly (the learning phase). A low-frequency rTMS train (0.6 Hz, subthreshold, 12 min) over either the tongue or the hand representation of M1 (between-subjects design) was applied before participants experienced altered auditory feedback in the learning phase. We found that the group that received rTMS over the hand representation showed the expected compensatory response for the upwards shift in F1 by significantly reducing F1 and increasing the second formant (F2) frequencies in their productions. In contrast, these expected compensatory changes in both F1 and F2 did not occur in the group that received rTMS over the tongue representation. Critically, rTMS (subthreshold) over the tongue representation did not affect vowel production, which was unchanged from baseline. These results provide direct evidence that the articulatory representation in left M1 causally contributes to sensorimotor learning in speech. Furthermore, these results also suggest that M1 is critical to the network supporting a more global adaptation that aims to move the altered speech production closer to a learnt pattern of speech production used to produce another vowel.}, } @article {pmid34714438, year = {2022}, author = {Sturdy, SK and Smith, DRR and George, DN}, title = {Domestic dogs (Canis lupus familiaris) are sensitive to the correlation between pitch and timbre in human speech.}, journal = {Animal cognition}, volume = {25}, number = {3}, pages = {545-554}, pmid = {34714438}, issn = {1435-9456}, mesh = {Animals ; Dogs ; Female ; Humans ; Male ; Pitch Perception ; Sex Characteristics ; Speech ; Speech Acoustics ; *Voice ; *Wolves ; }, abstract = {The perceived pitch of human voices is highly correlated with the fundamental frequency (f0) of the laryngeal source, which is determined largely by the length and mass of the vocal folds. The vocal folds are larger in adult males than in adult females, and men's voices consequently have a lower pitch than women's. The length of the supralaryngeal vocal tract (vocal-tract length; VTL) affects the resonant frequencies (formants) of speech which characterize the timbre of the voice. Men's longer vocal tracts produce lower frequency, and less dispersed, formants than women's shorter vocal tracts. Pitch and timbre combine to influence the perception of speaker characteristics such as size and age. Together, they can be used to categorize speaker sex with almost perfect accuracy. While it is known that domestic dogs can match a voice to a person of the same sex, there has been no investigation into whether dogs are sensitive to the correlation between pitch and timbre. We recorded a female voice giving three commands ('Sit', 'Lay down', 'Come here'), and manipulated the recordings to lower the fundamental frequency (thus lowering pitch), increase simulated VTL (hence affecting timbre), or both (synthesized adult male voice). Dogs responded to the original adult female and synthesized adult male voices equivalently. Their tendency to obey the commands was, however, reduced when either pitch or timbre was manipulated alone.
These results suggest that dogs are sensitive to both the pitch and timbre of human voices, and that they learn about the natural covariation of these perceptual attributes.}, } @article {pmid34649740, year = {2024}, author = {Lester-Smith, RA and Derrick, E and Larson, CR}, title = {Characterization of Source-Filter Interactions in Vocal Vibrato Using a Neck-Surface Vibration Sensor: A Pilot Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {1}, pages = {1-9}, pmid = {34649740}, issn = {1873-4588}, support = {90AR5015/ACL/ACL HHS/United States ; R21 DC017001/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Pilot Projects ; Voice Quality ; Vibration ; *Voice/physiology ; *Singing ; }, abstract = {PURPOSE: Vocal vibrato is a singing technique that involves periodic modulation of fundamental frequency (fo) and intensity. The physiological sources of modulation within the speech mechanism and the interactions between the laryngeal source and vocal tract filter in vibrato are not fully understood. Therefore, the purpose of this study was to determine if differences in the rate and extent of fo and intensity modulation could be captured using simultaneously recorded signals from a neck-surface vibration sensor and a microphone, which represent features of the source before and after supraglottal vocal tract filtering.

METHOD: Nine classically-trained singers produced sustained vowels with vibrato while simultaneous signals were recorded using a vibration sensor and a microphone. Acoustical analyses were performed to measure the rate and extent of fo and intensity modulation for each trial. Paired-samples sign tests were used to analyze differences between the rate and extent of fo and intensity modulation in the vibration sensor and microphone signals.
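Rate and extent of f0 modulation are typically read off a detrended f0 contour and its spectrum. A minimal sketch on a synthetic contour (5.5 Hz vibrato, about a semitone of extent), not the authors' exact procedure:

```python
# Sketch of vibrato rate/extent estimation: convert the f0 contour to cents,
# detrend by the mean, locate the dominant modulation frequency by FFT, and
# take half the peak-to-peak excursion as the extent. Contour is synthetic.
import numpy as np

fs_contour = 100.0                               # f0 samples per second
t = np.arange(0, 2, 1 / fs_contour)
f0 = 220 * 2 ** ((1.0 / 12) * np.sin(2 * np.pi * 5.5 * t))   # +/- 1 semitone around 220 Hz

cents = 1200 * np.log2(f0 / np.mean(f0))
spec = np.abs(np.fft.rfft(cents - np.mean(cents)))
freqs = np.fft.rfftfreq(cents.size, 1 / fs_contour)
band = (freqs >= 3) & (freqs <= 9)               # typical vibrato-rate search range
rate = freqs[band][np.argmax(spec[band])]
extent = (np.max(cents) - np.min(cents)) / 2     # half peak-to-peak, in cents
print(f"rate ~ {rate:.1f} Hz, extent ~ {extent:.0f} cents")
```

The same recipe applies to an intensity contour, which is what allows rate and extent to be compared across the vibration-sensor and microphone signals.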

RESULTS: The rate and extent of fo modulation and the extent of intensity modulation were equivalent in the vibration sensor and microphone signals, but the rate of intensity modulation was significantly higher in the microphone signal than in the vibration sensor signal. Larger differences in the rate of intensity modulation were seen with vowels that typically have smaller differences between the first and second formant frequencies.

CONCLUSIONS: This study demonstrated that the rate of intensity modulation at the source prior to supraglottal vocal tract filtering, as measured in neck-surface vibration sensor signals, was lower than the rate of intensity modulation after supraglottal vocal tract filtering, as measured in microphone signals. The difference in rate varied based on the vowel. These findings provide further support of the resonance-harmonics interaction in vocal vibrato. Further investigation is warranted to determine if differences in the physiological source(s) of vibrato account for inconsistent relationships between the extent of intensity modulation in neck-surface vibration sensor and microphone signals.}, } @article {pmid34642073, year = {2024}, author = {Tarai, SK and Chatterjee, I and Pani, S}, title = {A Comparative Acoustic Analysis of Bangla Folk Song and RabindraSangeet on Long-Term Average Spectrum.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {2}, pages = {304-308}, doi = {10.1016/j.jvoice.2021.08.014}, pmid = {34642073}, issn = {1873-4588}, mesh = {Humans ; Female ; Adolescent ; Young Adult ; Adult ; Voice Quality ; *Voice ; Phonation ; Acoustics ; *Singing ; }, abstract = {BACKGROUND: Singing is defined as a sensory-motor phenomenon that requires particular balanced physical skills such as respiration, phonation, resonance, and articulation. The long-term average spectrum (LTAS) is widely accepted as a robust and effective tool for the assessment of voice characteristics.

METHOD: Eighty female singers aged 18-30 years participated in the study. Of the 80 participants, 40 were asked to perform one traditional Bangla Folk song representing the Baul style, and the other 40 were asked to perform a traditional Rabindra Sangeet song. Recordings were made, and acoustic (LTAS) analyses were performed with the PRAAT software. Statistical analyses of the extracted data were performed using the Statistical Package for the Social Sciences (SPSS, Version 20.0).
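Outside PRAAT, an LTAS can be approximated by averaging the power spectrum over the whole recording, for example with Welch's method. A sketch with a synthetic signal standing in for the sung performances; the 2-4 kHz peak search mirrors the singer's-formant region examined in the results:

```python
# Sketch of an LTAS: average the power spectrum over the recording (Welch's
# method) and express it in dB re the maximum. A noise-plus-2.8-kHz-tone
# signal stands in for a sung performance.
import numpy as np
from scipy.signal import welch

fs = 44100
t = np.arange(10 * fs) / fs
x = 0.1 * np.random.default_rng(0).standard_normal(t.size) + np.sin(2 * np.pi * 2800 * t)

freqs, psd = welch(x, fs=fs, nperseg=4096)
ltas_db = 10 * np.log10(psd / np.max(psd))

region = (freqs >= 2000) & (freqs <= 4000)       # singer's-formant region
peak_hz = freqs[region][np.argmax(ltas_db[region])]
print(f"spectral peak in the 2-4 kHz region: {peak_hz:.0f} Hz")
```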

RESULTS: The averaged LTAS curve of the Baul style showed a broad peak in the frequency range between 2000 and 3600 Hz with an amplitude of about 16 dB, whereas Rabindra Sangeet showed a broader peak between 2200 and 3800 Hz with an amplitude of about 15 dB. This evidence indicates the presence of the singer's formant in both singing styles.

CONCLUSION: It can be concluded from the present study that there is an acoustic difference between the Bangla Folk and Rabindra Sangeet singing styles, which can be demonstrated using LTAS in PRAAT.}, } @article {pmid34642071, year = {2024}, author = {Lee, Y and Park, HJ and Bae, IH and Kim, G}, title = {Resonance Characteristics in Epiglottic Cyst: Formant Frequency, Vowel Space Area, Vowel Articulatory Index, and Formant Centralization Ratio.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {2}, pages = {273-278}, doi = {10.1016/j.jvoice.2021.09.008}, pmid = {34642071}, issn = {1873-4588}, mesh = {Humans ; Male ; Speech Acoustics ; *Laryngeal Diseases/diagnostic imaging/surgery ; *Voice ; Voice Quality ; *Cysts/diagnostic imaging/surgery ; Phonetics ; }, abstract = {OBJECTIVES: Resonance characteristics can change due to alterations in the shape of the vocal tract in patients with epiglottic cysts. This study aimed to analyze the resonance characteristics before and after the surgical excision of epiglottic cysts.

METHODS: Twelve male patients with epiglottic cysts were enrolled in this study. We analyzed the first and second formants (F1 and F2) in vowels /a/, /e/, /i/, /o/, and /u/, vowel space area (VSA), vowel articulatory index (VAI), and formant centralization ratio (FCR). We measured these parameters before and after the surgical excision of epiglottic cysts.
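The derived measures have standard closed forms: the triangular VSA over the corner vowels, and the Sapir-style VAI with FCR as its reciprocal. A direct transcription, assuming the triangular variant over /a/, /i/, /u/ and illustrative formant values in Hz:

```python
# VSA, VAI, and FCR from corner-vowel formants. VSA is the area of the
# /a/-/i/-/u/ triangle in (F1, F2) space; FCR = 1 / VAI. Values are
# illustrative, not the study's measurements.
def vsa(f1a, f2a, f1i, f2i, f1u, f2u):
    """Triangular vowel space area spanned by /a/, /i/, /u/ (Hz^2)."""
    return abs(f1i * (f2a - f2u) + f1a * (f2u - f2i) + f1u * (f2i - f2a)) / 2

def vai(f1a, f2a, f1i, f2i, f1u, f2u):
    """Vowel articulatory index; larger means a more expanded vowel space."""
    return (f2i + f1a) / (f1i + f1u + f2a + f2u)

pre = dict(f1a=700, f2a=1250, f1i=310, f2i=2100, f1u=330, f2u=750)
print(vsa(**pre), vai(**pre), 1 / vai(**pre))   # FCR is the reciprocal of VAI
```

Under this construction, the reported pattern (VSA and VAI up, FCR down after surgery) is internally consistent, since FCR must fall whenever VAI rises.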

RESULTS: There was a significant increase in the F1 values of /a/, VSA, and VAI, and a significant decrease in the value of FCR after the surgery.

CONCLUSION: We confirmed changes in the resonance characteristics of patients with epiglottic cysts. Further studies on epiglottic cysts and resonance changes are needed.}, } @article {pmid34641989, year = {2021}, author = {König, A and Mallick, E and Tröger, J and Linz, N and Zeghari, R and Manera, V and Robert, P}, title = {Measuring neuropsychiatric symptoms in patients with early cognitive decline using speech analysis.}, journal = {European psychiatry : the journal of the Association of European Psychiatrists}, volume = {64}, number = {1}, pages = {e64}, pmid = {34641989}, issn = {1778-3585}, mesh = {Aged ; Anxiety/diagnosis ; *Apathy ; *Cognitive Dysfunction/diagnosis ; Female ; Humans ; Machine Learning ; Male ; Neuropsychological Tests ; Speech ; }, abstract = {BACKGROUND: Certain neuropsychiatric symptoms (NPS), namely apathy, depression, and anxiety, have demonstrated great value in predicting dementia progression, potentially representing a window of opportunity for timely diagnosis and treatment. However, sensitive and objective markers of these symptoms are still missing. Therefore, the present study aims to investigate the association between automatically extracted speech features and NPS in patients with mild neurocognitive disorders.

METHODS: Speech of 141 patients aged 65 or older with neurocognitive disorder was recorded while they performed two short narrative speech tasks. NPS were assessed with the neuropsychiatric inventory. Paralinguistic markers relating to prosodic, formant, source, and temporal qualities of speech were automatically extracted and correlated with NPS. Machine learning experiments were carried out to validate the diagnostic power of the extracted markers.
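
The pipeline described (automatic paralinguistic feature extraction followed by regression against NPS scores) could be sketched as follows; parselmouth and scikit-learn stand in for the authors' unnamed toolchain, and the file names and scores are placeholders:

```python
# Sketch of the described pipeline: extract prosodic/temporal markers from
# recordings, then regress them against NPS scores. Not the authors' code.
import numpy as np
import parselmouth
from sklearn.ensemble import RandomForestRegressor

def speech_features(path):
    snd = parselmouth.Sound(path)
    f0 = snd.to_pitch().selected_array["frequency"]
    f0 = f0[f0 > 0]                                 # voiced frames only
    return [
        float(np.mean(f0)), float(np.std(f0)),      # prosodic markers
        float(np.mean(snd.to_intensity().values)),  # energy marker
        snd.get_total_duration(),                   # crude temporal marker
    ]

paths = ["patient_001.wav", "patient_002.wav"]      # hypothetical recordings
apathy = np.array([4.0, 1.0])                       # placeholder NPI subscores

X = np.array([speech_features(p) for p in paths])
model = RandomForestRegressor(n_estimators=200, random_state=0).fit(X, apathy)
print(model.predict(X))
```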

RESULTS: Different speech variables are associated with specific NPS; apathy correlates with temporal aspects, and anxiety with voice quality; these associations were mostly consistent between male and female participants after correction for cognitive impairment. Machine learning regressors were able to extract information from speech features and performed above baseline in predicting anxiety, apathy, and depression scores.

CONCLUSIONS: Different NPS seem to be characterized by distinct speech features, which are easily extractable automatically from short vocal tasks. These findings support the use of speech analysis for detecting subtypes of NPS in patients with cognitive impairment. This could have great implications for the design of future clinical trials as this cost-effective method could allow more continuous and even remote monitoring of symptoms.}, } @article {pmid34632373, year = {2021}, author = {Coto-Solano, R and Stanford, JN and Reddy, SK}, title = {Advances in Completely Automated Vowel Analysis for Sociophonetics: Using End-to-End Speech Recognition Systems With DARLA.}, journal = {Frontiers in artificial intelligence}, volume = {4}, number = {}, pages = {662097}, pmid = {34632373}, issn = {2624-8212}, abstract = {In recent decades, computational approaches to sociophonetic vowel analysis have been steadily increasing, and sociolinguists now frequently use semi-automated systems for phonetic alignment and vowel formant extraction, including FAVE (Forced Alignment and Vowel Extraction, Rosenfelder et al., 2011; Evanini et al., Proceedings of Interspeech, 2009), Penn Aligner (Yuan and Liberman, J. Acoust. Soc. America, 2008, 123, 3878), and DARLA (Dartmouth Linguistic Automation), (Reddy and Stanford, DARLA Dartmouth Linguistic Automation: Online Tools for Linguistic Research, 2015a). Yet these systems still have a major bottleneck: manual transcription. For most modern sociolinguistic vowel alignment and formant extraction, researchers must first create manual transcriptions. This human step is painstaking, time-consuming, and resource intensive. If this manual step could be replaced with completely automated methods, sociolinguists could potentially tap into vast datasets that have previously been unexplored, including legacy recordings that are underutilized due to lack of transcriptions. Moreover, if sociolinguists could quickly and accurately extract phonetic information from the millions of hours of new audio content posted on the Internet every day, a virtual ocean of speech from newly created podcasts, videos, live-streams, and other audio content would now inform research. How close are the current technological tools to achieving such groundbreaking changes for sociolinguistics? Prior work (Reddy et al., Proceedings of the North American Association for Computational Linguistics 2015 Conference, 2015b, 71-75) showed that an HMM-based Automated Speech Recognition system, trained with CMU Sphinx (Lamere et al., 2003), was accurate enough for DARLA to uncover evidence of the US Southern Vowel Shift without any human transcription. Even so, because that automatic speech recognition (ASR) system relied on a small training set, it produced numerous transcription errors. Six years have passed since that study, and since that time numerous end-to-end automatic speech recognition (ASR) algorithms have shown considerable improvement in transcription quality. One example of such a system is the RNN/CTC-based DeepSpeech from Mozilla (Hannun et al., 2014). (RNN stands for recurrent neural networks, the learning mechanism for DeepSpeech. CTC stands for connectionist temporal classification, the mechanism to merge phones into words). The present paper combines DeepSpeech with DARLA to push the technological envelope and determine how well contemporary ASR systems can perform in completely automated vowel analyses with sociolinguistic goals. 
Specifically, we used these techniques on audio recordings from 352 North American English speakers in the International Dialects of English Archive (IDEA), extracting 88,500 tokens of vowels in stressed position from spontaneous, free speech passages. With this large dataset we conducted acoustic sociophonetic analyses of the Southern Vowel Shift and the Northern Cities Chain Shift in the North American IDEA speakers. We compared the results using three different sources of transcriptions: 1) IDEA's manual transcriptions as the baseline "ground truth", 2) the ASR built on CMU Sphinx used by Reddy et al. (Proceedings of the North American Association for Computational Linguistics 2015 Conference, 2015b, 71-75), and 3) the latest publicly available Mozilla DeepSpeech system. We input these three different transcriptions to DARLA, which automatically aligned and extracted the vowel formants from the 352 IDEA speakers. Our quantitative results show that newer ASR systems like DeepSpeech show considerable promise for sociolinguistic applications like DARLA. We found that DeepSpeech's automated transcriptions had a significantly lower character error rate than those from the prior Sphinx system (from 46% to 35%). When we performed the sociolinguistic analysis of the extracted vowel formants from DARLA, we found that the automated transcriptions from DeepSpeech matched the results from the ground truth for the Southern Vowel Shift (SVS): five vowels showed a shift in both transcriptions, and two vowels did not show a shift in either transcription. The Northern Cities Shift (NCS) was more difficult to detect, but ground truth and DeepSpeech matched for four vowels: One of the vowels showed a clear shift, and three showed no shift in either transcription. Our study therefore shows how technology has made progress toward greater automation in vowel sociophonetics, while also showing what remains to be done. Our statistical modeling provides a quantified view of both the abilities and the limitations of a completely "hands-free" analysis of vowel shifts in a large dataset. Naturally, when comparing a completely automated system against a semi-automated system involving human manual work, there will always be a tradeoff between accuracy on the one hand versus speed and replicability on the other hand [Kendall and Joseph, Towards best practices in sociophonetics (with Marianna DiPaolo), 2014]. The amount of "noise" that can be tolerated for a given study will depend on the particular research goals and researchers' preferences. Nonetheless, our study shows that, for certain large-scale applications and research goals, a completely automated approach using publicly available ASR can produce meaningful sociolinguistic results across large datasets, and these results can be generated quickly, efficiently, and with full replicability.}, } @article {pmid34632133, year = {2021}, author = {Sondhi, S and Salhan, A and Santoso, CA and Doucoure, M and Dharmawan, DM and Sureka, A and Natasha, BN and Danusaputro, AD and Dowson, NS and Yap, MSL and Hadiwidjaja, MA and Veeraraghavan, SG and Hatta, AZR and Lee, C and Megantara, RA and Wihardja, AN and Sharma, M and Lardizabal, EL and Sondhi, LJ and Raina, R and Vashisth, S and Hedwig, R}, title = {Voice processing for COVID-19 scanning and prognostic indicator.}, journal = {Heliyon}, volume = {7}, number = {10}, pages = {e08134}, pmid = {34632133}, issn = {2405-8440}, abstract = {The COVID-19 pandemic has posed a serious risk of contagion to humans.
There is a need for reliable non-contact tests, such as vocal correlates of COVID-19 infection. Thirty-six volunteers of Asian ethnicity, comprising 16 infected subjects (8 male, 8 female) and 20 non-infected controls (10 male, 10 female), participated in this study by vocalizing the vowels /a/, /e/, /i/, /o/, /u/. Voice correlates of the 16 COVID-19 positive patients were compared during infection and after recovery with the 20 non-infected controls. Compared to non-infected controls, significantly higher values of energy intensity for /o/ (p = 0.048); formant F1 for /o/ (p = 0.014); and formant F3 for /u/ (p = 0.032) were observed in male patients, while higher values of Jitter (local, abs) for /o/ (p = 0.021) and Jitter (ppq5) for /a/ (p = 0.014) were observed in female patients. However, formant F2 for /u/ (p = 0.018) and mean pitch F0 for /e/, /i/ and /o/ (p = 0.033; 0.036; 0.047) decreased for female patients under infection. Compared to recovered conditions, HNR for /e/ (p = 0.014) was higher in male patients under infection, while Jitter (rap) for /a/ (p = 0.041); Jitter (ppq5) for /a/ (p = 0.032); Shimmer (local, dB) for /i/ (p = 0.024); Shimmer (apq5) for /u/ (p = 0.019); and formant F4 for vowel /o/ (p = 0.022) were higher in female patients under infection. However, HNR for /e/ (p = 0.041) and formant F1 for /o/ (p = 0.002) were lower in female patients compared to their recovered conditions. The obtained results support the hypothesis, since the changes in voice parameters observed in the infected patients can be correlated with a combination of acoustic measures such as fundamental frequency, formant characteristics, HNR, and voice perturbations such as jitter and shimmer for different vowels. Thus, voice analysis can be used for scanning and prognosis of COVID-19 infection. Based on the findings of this study, a mobile application can be developed to analyze human voice in real time to detect COVID-19 symptoms for remedial measures and necessary action.}, } @article {pmid34550454, year = {2022}, author = {Gama, R and Castro, ME and van Lith-Bijl, JT and Desuter, G}, title = {Does the wearing of masks change voice and speech parameters?.}, journal = {European archives of oto-rhino-laryngology : official journal of the European Federation of Oto-Rhino-Laryngological Societies (EUFOS) : affiliated with the German Society for Oto-Rhino-Laryngology - Head and Neck Surgery}, volume = {279}, number = {4}, pages = {1701-1708}, pmid = {34550454}, issn = {1434-4726}, mesh = {Acoustics ; Humans ; Phonation ; Speech ; Speech Acoustics ; *Voice ; *Voice Disorders/etiology/prevention & control ; Voice Quality ; }, abstract = {PURPOSE: The authors aim to review available reports on the potential effects of masks on voice and speech parameters.

METHODS: A literature search was conducted using the MEDLINE and Google Scholar databases through July 2021. Several target populations, mask scenarios, and methodologies were covered. The assessed voice parameters were divided into self-reported, acoustic, and aerodynamic.

RESULTS: Wearing a face mask was shown to induce several changes in voice parameters: (1) self-reported-significantly increased vocal effort and fatigue, increased vocal tract discomfort, and increased values of the voice handicap index (VHI) were observed; (2) acoustics-increased voice intensity, altered formant frequencies (F2 and F3) with no changes in fundamental frequency, increased harmonics-to-noise ratio (HNR), and increased mean spectral values at high-frequency levels (1000-8000 Hz), especially with the KN95 mask; (3) aerodynamics-maximum phonatory time was assessed in only two reports and showed no alterations.

CONCLUSION: Despite the different populations, mask-type scenarios, and methodologies described by each study, the results of this review outline the significant changes in voice characteristics with the use of face masks. Wearing a mask was shown to increase the perception of vocal effort and to alter vocal tract length and speech articulatory movements, leading to spectral sound changes and impaired communication and perception. Studies analyzing the effect of masks on voice aerodynamics are lacking. Further research is required to study the long-term effects of face masks on the potential development of voice pathology.}, } @article {pmid34543515, year = {2021}, author = {Wang, Y and Qiu, X and Wang, F and Li, Y and Guo, H and Nie, L}, title = {Single-crystal ordered macroporous metal-organic framework as support for molecularly imprinted polymers and their integration in membrane formant for the specific recognition of zearalenone.}, journal = {Journal of separation science}, volume = {44}, number = {22}, pages = {4190-4199}, doi = {10.1002/jssc.202100393}, pmid = {34543515}, issn = {1615-9314}, support = {20180307024//Science and Technology Project of Guangdong Province/ ; 2017KTSCX169//Department of Education of Guangdong Province/ ; SZ2018KJ03//scientific research project of Shaoguan University/ ; S202010576027//National College Students Innovation and Entrepreneurship Training Program/ ; pdjh2020a0530//Special Fund for Science and Technology Innovation Strategy of Guangdong Province/ ; CX20201043//Hunan Provincial Innovation Foundation For Postgraduate/ ; 19A144//Scientific Research Fund of Hunan Provincial Education Department/ ; 2019JJ60058//Natural Science Foundation of Hunan Province/ ; 2020JJ6102//Natural Science Foundation of Hunan Province/ ; }, mesh = {Chromatography, High Pressure Liquid/methods ; Edible Grain/*chemistry ; Extraction and Processing Industry/methods ; Food Contamination/analysis ; Metal-Organic Frameworks ; Molecular Imprinting/methods ; Molecularly Imprinted Polymers ; Mycotoxins/analysis/chemistry ; Solid Phase Extraction/methods ; Zearalenone/*analysis/chemistry ; }, abstract = {Zearalenone is a fungal contaminant that is widely present in grains. Here, a novel molecularly imprinted membrane based on SOM-ZIF-8 was developed for the rapid and highly selective identification of zearalenone in grain samples. The molecularly imprinted membrane was prepared using polyvinylidene fluoride, cyclododecyl 2,4-dihydroxybenzoate as a template and SOM-ZIF-8 as a carrier. The factors influencing the extraction of zearalenone using this membrane, including the solution pH, extraction time, elution solvent, elution time, and elution volume, were studied in detail. The optimized conditions were 5 mL of sample solution at pH 6, extraction time of 45 min, 4 mL of acetonitrile:methanol = 9:1 as elution solvent, and elution time of 20 min. This method displayed a good linear range of 12-120 ng/g (R² = 0.998), with limits of detection and quantification of 1.7 and 5.5 ng/g, respectively. In addition, the membrane was used to selectively identify zearalenone in grain samples with percent recoveries ranging from 87.9 to 101.0% and relative standard deviation of less than 6.6%.
Overall, this study presents a simple and effective chromatographic pretreatment method for detecting zearalenone in food samples.}, } @article {pmid34538710, year = {2022}, author = {Erdur, OE and Yilmaz, BS}, title = {Voice changes after surgically assisted rapid maxillary expansion.}, journal = {American journal of orthodontics and dentofacial orthopedics : official publication of the American Association of Orthodontists, its constituent societies, and the American Board of Orthodontics}, volume = {161}, number = {1}, pages = {125-132}, doi = {10.1016/j.ajodo.2020.06.055}, pmid = {34538710}, issn = {1097-6752}, mesh = {Acoustics ; Adult ; Humans ; Maxilla ; *Palatal Expansion Technique ; *Voice Quality ; }, abstract = {INTRODUCTION: This study aimed to investigate voice changes in patients who had surgically assisted rapid maxillary expansion (SARME).

METHODS: Nineteen adult patients with maxillary transverse deficiency were asked to pronounce the sounds "[a], [ϵ], [ɯ], [i], [ɔ], [œ], [u], [y]" for 3 seconds. Voice recordings were made before the expansion appliance was placed (T0) and 5.8 weeks after its removal (T1, after 5.2 months of retention). The same recordings were made for the control group (n = 19). The fundamental frequency (F0), formant frequencies (F1, F2, and F3), shimmer, jitter, and noise-to-harmonics ratio (NHR) were measured with Praat (version 6.0.43).
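
The perturbation measures named here can be reproduced with Praat's own algorithms through the parselmouth bridge; a minimal sketch, assuming a mono vowel recording (the file name and pitch bounds are placeholders, and note that Praat reports harmonicity as HNR):

```python
# Minimal sketch of jitter/shimmer/harmonicity measurement with Praat's
# algorithms via parselmouth; argument sets follow Praat's defaults.
import parselmouth
from parselmouth.praat import call

snd = parselmouth.Sound("vowel_a.wav")           # hypothetical recording
pp = call(snd, "To PointProcess (periodic, cc)", 75, 500)

jitter = call(pp, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
shimmer = call([snd, pp], "Get shimmer (local)",
               0, 0, 0.0001, 0.02, 1.3, 1.6)

harm = call(snd, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
hnr_db = call(harm, "Get mean", 0, 0)

print(f"jitter = {jitter:.4f}, shimmer = {shimmer:.4f}, HNR = {hnr_db:.1f} dB")
```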

RESULTS: In the SARME group, significant differences were observed in the F1 of [a] (P = 0.005), F2 of [ϵ] (P = 0.008), and [œ] sounds (P = 0.004). The postexpansion values were lower than those recorded before. In contrast, the F1 of the [y] sound (P = 0.02), F2 of the [u] sound (P = 0.01), the jitter parameter of the [ɯ] and [i] sounds (P = 0.04; P = 0.002), and the NHR value of the [ϵ] sound (P = 0.04) were significantly higher than the baseline values. In the comparison with the control group, significant differences were found in the F0 (P = 0.025) and F1 (P = 0.046) of the [u] sound, the F1 of the [a] sound (P = 0.03), and the F2 of the [ϵ] sound (P = 0.037). Significant differences were also found in the shimmer of [i] (P = 0.017) and [ɔ] (P = 0.002), the jitter of [ϵ] (P = 0.046) and [i] (P = 0.017), and the NHR of [i] (P = 0.012) and [ɔ] (P = 0.009).

CONCLUSION: SARME led to significant differences in some of the acoustic parameters.}, } @article {pmid34498908, year = {2022}, author = {Perlman, M and Paul, J and Lupyan, G}, title = {Vocal communication of magnitude across language, age, and auditory experience.}, journal = {Journal of experimental psychology. General}, volume = {151}, number = {4}, pages = {885-896}, doi = {10.1037/xge0001103}, pmid = {34498908}, issn = {1939-2222}, support = {//NSF-INSPIRE/ ; //NSF-PAC/ ; }, mesh = {Adolescent ; Animals ; China ; Culture ; Humans ; *Language ; *Voice ; }, abstract = {Like many other vocalizing vertebrates, humans convey information about their body size through the sound of their voice. Vocalizations of larger animals are typically longer in duration, louder in intensity, and lower in frequency. We investigated people's ability to use voice-size correspondences to communicate about the magnitude of external referents. First, we asked hearing children, as well as deaf children and adolescents, living in China to improvise nonlinguistic vocalizations to distinguish between paired items contrasting in magnitude (e.g., a long vs. short string, a big vs. small ball). Then we played these vocalizations back to adult listeners in the United States and China to assess their ability to correctly guess the intended referents. We find that hearing and deaf producers both signaled greater magnitude items with longer and louder vocalizations and with smaller formant spacing. Only hearing producers systematically used fundamental frequency, communicating greater magnitude with higher fo. The vocalizations of both groups were understandable to Chinese and American listeners, although accuracy was higher with vocalizations from older producers. American listeners relied on the same acoustic properties as Chinese listeners: both groups interpreted vocalizations with longer duration and greater intensity as referring to greater items; neither American nor Chinese listeners consistently used fo or formant spacing as a cue. These findings show that the human ability to use vocalizations to communicate about the magnitude of external referents is highly robust, extending across listeners of disparate linguistic and cultural backgrounds, as well as across age and auditory experience. (PsycInfo Database Record (c) 2022 APA, all rights reserved).}, } @article {pmid34482728, year = {2021}, author = {Stansbury, AL and Janik, VM}, title = {The role of vocal learning in call acquisition of wild grey seal pups.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {376}, number = {1836}, pages = {20200251}, pmid = {34482728}, issn = {1471-2970}, mesh = {Animals ; Female ; *Learning ; Male ; *Seals, Earless ; *Vocalization, Animal ; }, abstract = {Pinnipeds have been identified as one of the best available models for the study of vocal learning. Experimental evidence for their learning skills is demonstrated with advanced copying skills, particularly in formant structure when copying human speech sounds and melodies. By contrast, almost no data are available on how learning skills are used in their own communication systems. We investigated the impact of playing modified seal sounds in a breeding colony of grey seals (Halichoerus grypus) to study how acoustic input influenced vocal development of eight pups. Sequences of two or three seal pup calls were edited so that the average peak frequency between calls in a sequence changed up or down.
We found that seals copied the specific stimuli played to them and that copies became more accurate over time. The differential response of different groups showed that vocal production learning was used to achieve conformity, suggesting that geographical variation in seal calls can be caused by horizontal cultural transmission. While learning of pup calls appears to have few benefits, we suggest that it also affects the development of the adult repertoire, which may facilitate social interactions such as mate choice. This article is part of the theme issue 'Vocal learning in animals and humans'.}, } @article {pmid34474938, year = {2024}, author = {Güths, RC and Rolim, MRP and Coelho, A}, title = {Glottal Voice Distortions: Nasolaryngoscopic and Spectral Analysis of Anatomophysiologic Changes in Singing Voice.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {1}, pages = {31-39}, doi = {10.1016/j.jvoice.2021.07.018}, pmid = {34474938}, issn = {1873-4588}, mesh = {Humans ; *Singing ; Voice Quality ; *Voice ; Vocal Cords ; *Larynx ; }, abstract = {Distorted voices, commonly called vocal drives in Brazil and in some other South American countries, are vocal ornaments belonging to the aesthetics of popular singing and desired by singers of different styles. The advances in the vocal sciences have allowed the demystification of this type of technique over the last four decades, classifying such distortions as glottal, supraglottic, or mixed distortions/drives. The interdisciplinary approach in the evaluation of singers who use glottal distortions is fundamental for a broad understanding of the particularities of each case. The main objective of the present study is to describe the anatomophysiological and spectral findings of the glottal distortions identified in the practice of many singers. A sample of three singers performing sung emissions with and without vocal distortions was collected. A PreSonus® AudioBox Studio One kit was used to record the voice during the nasolaryngoscopic evaluation. The singers underwent vocal warm-up and functional evaluation of the larynx based on two studies on contemporary singers. The singers performed the Snarl Voice and Phaser distortions, and both showed particular anatomophysiological behaviors. The larynx was low in the first distortion and at the level of the clean voice in the second; a posterior opening of the glottis was observed in both distortions, with an additional opening of the middle third of the glottis in the first. Formants vary according to the vocal tract settings used for the distortions. The glottic distortions present a complex anatomophysiological behavior in their composition, with fundamental participation of the transverse interarytenoid muscle and the lateral cricoarytenoids, as well as the participation of the vocal fold in the frequency break.
F3 varied with the longitudinal length of the vocal tract and F4 with its diameter, both being related to three-dimensional adjustments of the vocal tract.}, } @article {pmid34470280, year = {2021}, author = {Stehr, DA and Hickok, G and Ferguson, SH and Grossman, ED}, title = {Examining vocal attractiveness through articulatory working space.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {2}, pages = {1548}, doi = {10.1121/10.0005730}, pmid = {34470280}, issn = {1520-8524}, mesh = {Acoustics ; Female ; Humans ; Language ; Male ; Phonetics ; Speech ; *Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {Robust gender differences exist in the acoustic correlates of clearly articulated speech, with females, on average, producing speech that is acoustically and phonetically more distinct than that of males. This study investigates the relationship between several acoustic correlates of clear speech and subjective ratings of vocal attractiveness. Talkers were recorded producing vowels in /bVd/ context and sentences containing the four corner vowels. Multiple measures of working vowel space were computed from continuously sampled formant trajectories and were combined with measures of speech timing known to co-vary with clear articulation. Partial least squares regression (PLS-R) modeling was used to predict ratings of vocal attractiveness for male and female talkers based on the acoustic measures. PLS components that loaded on size and shape measures of working vowel space-including the quadrilateral vowel space area, convex hull area, and bivariate spread of formants-along with measures of speech timing were highly successful at predicting attractiveness in female talkers producing /bVd/ words. These findings are consistent with a number of hypotheses regarding human attractiveness judgments, including the role of sexual dimorphism in mate selection, the significance of traits signalling underlying health, and perceptual fluency accounts of preferences.}, } @article {pmid34470262, year = {2021}, author = {Sahoo, S and Dandapat, S}, title = {Analyzing the vocal tract characteristics for out-of-breath speech.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {2}, pages = {1524}, doi = {10.1121/10.0005945}, pmid = {34470262}, issn = {1520-8524}, mesh = {Female ; Humans ; Male ; Sound Spectrography ; Speech ; *Speech Acoustics ; *Voice ; }, abstract = {In this work, vocal tract characteristic changes under the out-of-breath condition are explored. Speaking under the influence of physical exercise is called out-of-breath speech. The change in breathing pattern results in perceptual changes in the produced sound. For the vocal tract, the first four formants show a lowering in their average frequencies. The bandwidths BF1 and BF2 widen, whereas the other two get narrower. The change in bandwidth is small for the last three. For a given speaker, the change in frequency and bandwidth may not be uniform across formants. Subband analysis is carried out around the formants for comparing the variation of the vocal tract with the source. A vocal tract adaptive empirical wavelet transform is used for extracting formant-specific subbands from speech and source. A support vector machine performs the subband-based binary classification between normal and out-of-breath speech. For all speakers, it shows an F1-score improvement of 4% over speech subbands.
Similarly, a performance improvement of 5% can be seen for both male and female speakers. Furthermore, the misclassification rate is lower for the source than for speech. These results suggest that physical exercise influences the source more than the vocal tract.}, } @article {pmid34470045, year = {2022}, author = {Dastolfo-Hromack, C and Bush, A and Chrabaszcz, A and Alhourani, A and Lipski, W and Wang, D and Crammond, DJ and Shaiman, S and Dickey, MW and Holt, LL and Turner, RS and Fiez, JA and Richardson, RM}, title = {Articulatory Gain Predicts Motor Cortex and Subthalamic Nucleus Activity During Speech.}, journal = {Cerebral cortex (New York, N.Y. : 1991)}, volume = {32}, number = {7}, pages = {1337-1349}, pmid = {34470045}, issn = {1460-2199}, support = {U01 NS098969/NS/NINDS NIH HHS/United States ; }, mesh = {*Deep Brain Stimulation ; Humans ; *Motor Cortex/physiology ; *Parkinson Disease/therapy ; Speech ; *Subthalamic Nucleus/physiology ; }, abstract = {Speaking precisely is important for effective verbal communication, and articulatory gain is one component of speech motor control that contributes to achieving this goal. Given that the basal ganglia have been proposed to regulate the speed and size of limb movement, that is, movement gain, we explored the basal ganglia contribution to articulatory gain, through local field potentials (LFP) recorded simultaneously from the subthalamic nucleus (STN), precentral gyrus, and postcentral gyrus. During STN deep brain stimulation implantation for Parkinson's disease, participants read aloud consonant-vowel-consonant syllables. Articulatory gain was indirectly assessed using the F2 Ratio, an acoustic measurement of the second formant frequency of /i/ vowels divided by that of /u/ vowels. Mixed effects models demonstrated that the F2 Ratio correlated with alpha and theta activity in the precentral gyrus and STN. No correlations were observed for the postcentral gyrus. Functional connectivity analysis revealed that higher phase locking values for beta activity between the STN and precentral gyrus were correlated with lower F2 Ratios, suggesting that higher beta synchrony impairs articulatory precision. Effects were not related to disease severity. These data suggest that articulatory gain is encoded within the basal ganglia-cortical loop.}, } @article {pmid34400103, year = {2023}, author = {Aires, MM and de Vasconcelos, D and Lucena, JA and Gomes, AOC and Moraes, BT}, title = {Effect of Wendler glottoplasty on voice and quality of life of transgender women.}, journal = {Brazilian journal of otorhinolaryngology}, volume = {89}, number = {1}, pages = {22-29}, pmid = {34400103}, issn = {1808-8686}, mesh = {Male ; Humans ; Female ; Adult ; *Transgender Persons ; Quality of Life ; Prospective Studies ; Treatment Outcome ; Speech Acoustics ; }, abstract = {OBJECTIVE: To investigate the effect of Wendler glottoplasty on voice feminization, voice quality and voice-related quality of life.

METHODS: Prospective interventional cohort of transgender women submitted to Wendler glottoplasty. Acoustic analysis of the voice included assessment of fundamental frequency, maximum phonation time, formant frequencies (F1 and F2), frequency range, jitter, and shimmer. Voice quality was blindly assessed using the GRBAS scale. Voice-related quality of life was measured using the Trans Woman Voice Questionnaire and the self-perceived femininity of the voice.
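
Of the listed measures, the fundamental-frequency ones are straightforward to reproduce; a sketch using parselmouth, where the file names and pitch bounds are assumptions rather than the study's settings:

```python
# Sketch of the F0 measures above: mean speaking F0 and frequency range.
import numpy as np
import parselmouth

def f0_stats(path, floor=75.0, ceiling=600.0):
    snd = parselmouth.Sound(path)
    pitch = snd.to_pitch(pitch_floor=floor, pitch_ceiling=ceiling)
    f0 = pitch.selected_array["frequency"]
    f0 = f0[f0 > 0]                              # drop unvoiced frames
    return float(np.mean(f0)), float(np.min(f0)), float(np.max(f0))

for path in ["pre_op_speech.wav", "post_op_speech.wav"]:  # hypothetical files
    mean_f0, lo, hi = f0_stats(path)
    print(f"{path}: mean F0 = {mean_f0:.1f} Hz, range = {lo:.1f}-{hi:.1f} Hz")
```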

RESULTS: A total of 7 patients were included. The mean age was 35.4 years, and the mean postoperative follow-up time was 13.7 months. There was a mean increase of 47.9 ± 46.6 Hz (p = 0.023) in sustained /e/ F0 and a mean increase of 24.6 ± 27.5 Hz (p = 0.029) in speaking F0 after glottoplasty. No statistically significant pre- to postoperative differences were found in maximum phonation time, formant frequencies, frequency range, jitter, shimmer, or the grade, roughness, breathiness, asthenia, and strain scale. The Trans Woman Voice Questionnaire score decreased following surgery from 98.3 ± 9.2 to 54.1 ± 25.0 (p = 0.007), and mean self-perceived femininity of the voice increased from 2.8 ± 1.8 to 7.7 ± 2.4 (p = 0.008). One patient (14%) presented a postoperative granuloma, and there was one (14%) premature suture dehiscence.

CONCLUSION: Glottoplasty is safe and effective for feminizing the voice of transgender women. There was an increase in fundamental frequency, without aggravating other acoustic parameters or voice quality. Voice-related quality of life improved after surgery.}, } @article {pmid34396801, year = {2022}, author = {Chung, H}, title = {Acoustic Characteristics of Pre- and Post-vocalic /l/: Patterns from One Southern White Vernacular English.}, journal = {Language and speech}, volume = {65}, number = {2}, pages = {513-528}, doi = {10.1177/00238309211037368}, pmid = {34396801}, issn = {1756-6053}, mesh = {Acoustics ; Adult ; Female ; Humans ; Language ; Male ; *Phonetics ; *Speech Acoustics ; }, abstract = {This study examined acoustic characteristics of the phoneme /l/ produced by young female and male adult speakers of Southern White Vernacular English (SWVE) from Louisiana. F1, F2, and F2-F1 values extracted at the /l/ midpoint were analyzed by word position (pre- vs. post-vocalic) and vowel contexts (/i, ɪ/ vs. /ɔ, a/). Descriptive analysis showed that SWVE /l/ exhibited characteristics of the dark /l/ variant. The formant patterns of /l/, however, differed significantly by word position and vowel context, with pre-vocalic /l/ showing significantly higher F2-F1 values than post-vocalic /l/, and /l/ in the high front vowel context showing significantly higher F2-F1 values than those in the low back vowel context. Individual variation in the effects of word position and vowel contexts on /l/ pattern was also observed. Overall, the findings of the current study showed a gradient nature of SWVE /l/ variants whose F2-F1 patterns generally fell into the range of the dark /l/ variant, while varying by word position and vowel context.}, } @article {pmid34388438, year = {2021}, author = {Yang, L and Fu, K and Zhang, J and Shinozaki, T}, title = {Non-native acoustic modeling for mispronunciation verification based on language adversarial representation learning.}, journal = {Neural networks : the official journal of the International Neural Network Society}, volume = {142}, number = {}, pages = {597-607}, doi = {10.1016/j.neunet.2021.07.017}, pmid = {34388438}, issn = {1879-2782}, mesh = {Acoustics ; Humans ; *Language ; Language Development ; Speech ; *Speech Perception ; }, abstract = {Non-native mispronunciation verification is designed to provide feedback to guide language learners to correct their pronunciation errors in their further learning and it plays an important role in the computer-aided pronunciation training (CAPT) system. Most existing approaches focus on establishing the acoustic model directly using non-native corpus thus they are suffering the data sparsity problem due to time-consuming non-native speech data collection and annotation tasks. In this work, to address this problem, we propose a pre-trained approach to utilize the speech data of two native languages (the learner's native and target languages) for non-native mispronunciation verification. We set up an unsupervised model to extract knowledge from a large scale of unlabeled raw speech of the target language by making predictions about future observations in the speech signal, then the model is trained with language adversarial training using the learner's native language to align the feature distribution of two languages by confusing a language discriminator. In addition, sinc filter is incorporated at the first convolutional layer to capture the formant-like feature. 
Formants are relevant to the place and manner of articulation. They are therefore useful not only for pronunciation error detection but also for providing instructive feedback. The pre-trained model then serves as the feature extractor in the downstream mispronunciation verification task. In experiments on the Japanese part of the BLCU inter-Chinese speech corpus, the results demonstrate that, for the non-native phone recognition and mispronunciation verification tasks: (1) the knowledge learned from the speech of the two native languages with the proposed unsupervised approach is useful for both tasks; (2) the proposed language adversarial representation learning effectively improves performance; and (3) formant-like features can be incorporated by introducing a sinc filter, further improving mispronunciation verification performance.}, } @article {pmid34384662, year = {2024}, author = {Leyns, C and Corthals, P and Cosyns, M and Papeleu, T and Van Borsel, J and Morsomme, D and T'Sjoen, G and D'haeseleer, E}, title = {Acoustic and Perceptual Effects of Articulation Exercises in Transgender Women.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {38}, number = {1}, pages = {246.e15-246.e25}, doi = {10.1016/j.jvoice.2021.06.033}, pmid = {34384662}, issn = {1873-4588}, mesh = {Male ; Humans ; Female ; *Speech Acoustics ; *Transgender Persons ; Acoustics ; Speech ; Masculinity ; Phonetics ; }, abstract = {PURPOSE: This study measured the impact of articulation exercises using a cork and articulation exercises for lip spreading on the formant frequencies of vowels and listener perceptions of femininity in transgender women.

METHODS: Thirteen transgender women were recorded before and after the cork exercise and before and after the lip spreading exercise. Speech samples included continuous speech during reading and were analyzed using Praat software. Vowel formant frequencies (F1, F2, F3, F4, F5) and vowel space were determined. A listening experiment was organized in which naïve cisgender women and cisgender men rated audio samples of continuous speech. Masculinity/femininity, vocal quality, and age were rated using a visual analogue scale (VAS).
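
The formant measurements (F1-F5) described here can be sketched with parselmouth's Burg tracker; the file name and vowel midpoint times are placeholders, and the 5500 Hz ceiling is a commonly used setting for female voices rather than the study's documented choice:

```python
# Sketch of midpoint formant extraction (F1-F5) with the Burg algorithm.
import parselmouth

snd = parselmouth.Sound("reading_passage.wav")     # hypothetical recording
formant = snd.to_formant_burg(max_number_of_formants=5,
                              maximum_formant=5500.0)

vowel_midpoints = {"a": 1.23, "u": 2.07}           # placeholder times (s)
for vowel, t in vowel_midpoints.items():
    values = [formant.get_value_at_time(n, t) for n in range(1, 6)]
    print(vowel, [f"{v:.0f}" for v in values])     # F1..F5 in Hz
```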

RESULTS: Concerning vowel formant frequencies, F2 /a/ and F5 /u/ significantly increased after the lip spreading exercise, as well as F3 /a/, F3 /u/ and F4 /a/ after the cork exercise. The lip spreading exercise had more impact on the F2 /a/ than the cork exercise. Vowel space did not change after the exercises. The fundamental frequency (fo) increased simultaneously during both exercises. Both articulation exercises were associated with significantly increased listener perceptions of femininity of the voice.

CONCLUSION: Subtle changes in formant frequencies can be observed after performing articulation exercises, but not in every formant frequency or vowel. Cisgender listeners rated the speech of the transgender women as more feminine after the exercises. Further research with a more extensive therapy program and listening experiment is needed to examine these preliminary findings.}, } @article {pmid34344099, year = {2021}, author = {Yang, JJ and Cheng, LY and Xu, W}, title = {[Study on changes of voice characteristics after adenotonsillectomy or adenoidectomy in children].}, journal = {Zhonghua er bi yan hou tou jing wai ke za zhi = Chinese journal of otorhinolaryngology head and neck surgery}, volume = {56}, number = {7}, pages = {724-729}, doi = {10.3760/cma.j.cn115330-20200813-00672}, pmid = {34344099}, issn = {1673-0860}, mesh = {Adenoidectomy ; *Adenoids/surgery ; Child ; Child, Preschool ; Female ; Humans ; Male ; Speech Acoustics ; *Tonsillectomy ; Voice Quality ; }, abstract = {Objective: To study voice changes in children after adenotonsillectomy or adenoidectomy and their relationship with vocal tract structure. Methods: Fifty patients aged 4 to 12 years (median age 6) were prospectively recruited. They underwent adenotonsillectomy or adenoidectomy at Beijing Tongren Hospital, Capital Medical University, from July 2019 to August 2020. The cases comprised 31 males and 19 females. Thirty-six patients underwent adenotonsillectomy and 14 patients underwent adenoidectomy alone. Twenty-two children (13 males, 9 females) with grade Ⅰ bilateral tonsils, no adenoid hypertrophy, and no snoring were selected as normal controls. Adenoid and tonsil sizes were evaluated. Subjective changes of voice were recorded after surgery. Moreover, voice data including fundamental frequency (F0), jitter, shimmer, noise-to-harmonics ratio (NHR), maximum phonation time (MPT), formant frequencies (F1-F5), and bandwidths (B1-B5) of the vowels /a/ and /i/ were analyzed before surgery and at 3 days and 1 month after surgery, respectively. SPSS 23.0 was used for statistical analysis. Results: Thirty-six patients (72.0%, 36/50) complained of postoperative voice changes. The incidence was inversely correlated with age. In children aged 4-6, 7-9, and 10-12, the incidence was 83.3% (25/30), 63.6% (7/11), and 44.4% (4/9), respectively. Voice changes appeared more common in children who underwent adenotonsillectomy (77.8%, 28/36) than in those who underwent adenoidectomy alone (57.1%, 8/14), but the difference was not statistically significant. After the operation, for the vowel /a/, MPT (Z=2.18, P=0.041) and F2 (t=2.13, P=0.040) increased, while B2 (Z=2.04, P=0.041) and B4 (Z=2.00, P=0.046) decreased. For the vowel /i/, F2 (t=2.035, P=0.050) and F4 (t=4.44, P=0.0001) increased, and B2 (Z=2.36, P=0.019) decreased. Other acoustic parameters were not significantly different from those before surgery. The F2 (r=-0.392, P=0.032) of the vowel /a/ and the F2 (r=-0.279, P=0.048) and F4 (r=-0.401, P=0.028) of the vowel /i/ after adenotonsillectomy were significantly higher than after adenoidectomy alone. Half of the patients with postoperative voice changes recovered spontaneously 1 month after surgery. Conclusions: Voice changes in children who underwent adenotonsillectomy or adenoidectomy might be related to changes in their formants and bandwidths. The effect of adenotonsillectomy on voice was more significant compared with that of adenoidectomy alone.
The acoustic parameters did not change significantly after surgery except MPT.}, } @article {pmid34342877, year = {2021}, author = {Frey, R and Wyman, MT and Johnston, M and Schofield, M and Locatelli, Y and Reby, D}, title = {Roars, groans and moans: Anatomical correlates of vocal diversity in polygynous deer.}, journal = {Journal of anatomy}, volume = {239}, number = {6}, pages = {1336-1369}, pmid = {34342877}, issn = {1469-7580}, mesh = {Acoustics ; Animals ; *Deer ; Female ; *Larynx ; Male ; Vocal Cords ; Vocalization, Animal ; }, abstract = {Eurasian deer are characterized by the extraordinary diversity of their vocal repertoires. Male sexual calls range from roars with relatively low fundamental frequency (hereafter fo) in red deer Cervus elaphus, to moans with extremely high fo in sika deer Cervus nippon, and almost infrasonic groans with exceptionally low fo in fallow deer Dama dama. Moreover, while both red and fallow males are capable of lowering their formant frequencies during their calls, sika males appear to lack this ability. Female contact calls are also characterized by relatively less pronounced, yet strong interspecific differences. The aim of this study is to examine the anatomical bases of these inter-specific and inter-sexual differences by identifying if the acoustic variation is reflected in corresponding anatomical variation. To do this, we investigated the vocal anatomy of male and female specimens of each of these three species. Across species and sexes, we find that the observed acoustic variability is indeed related to expected corresponding anatomical differences, based on the source-filter theory of vocal production. At the source level, low fo is associated with larger vocal folds, whereas high fo is associated with smaller vocal folds: sika deer have the smallest vocal folds and male fallow deer the largest. Red and sika deer vocal folds do not appear to be sexually dimorphic, while fallow deer exhibit strong sexual dimorphism (after correcting for body size differences). At the filter level, the variability in formants is related to the configuration of the vocal tract: in fallow and red deer, both sexes have evolved a permanently descended larynx (with a resting position of the larynx much lower in males than in females). Both sexes also have the potential for momentary, call-synchronous vocal tract elongation, again more pronounced in males than in females. In contrast, the resting position of the larynx is high in both sexes of sika deer and the potential for further active vocal tract elongation is virtually absent in both sexes. Anatomical evidence suggests an evolutionary reversal in larynx position within sika deer, that is, a secondary larynx ascent. Together, our observations confirm that the observed diversity of vocal behaviour in polygynous deer is supported by strong anatomical differences, highlighting the importance of anatomical specializations in shaping mammalian vocal repertoires. Sexual selection is discussed as a potential evolutionary driver of the observed vocal diversity and sexual dimorphisms.}, } @article {pmid34340503, year = {2021}, author = {Strycharczuk, P and Ćavar, M and Coretta, S}, title = {Distance vs time. 
Acoustic and articulatory consequences of reduced vowel duration in Polish.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {1}, pages = {592}, doi = {10.1121/10.0005585}, pmid = {34340503}, issn = {1520-8524}, mesh = {Acoustics ; Language ; *Phonetics ; Poland ; Speech ; *Speech Acoustics ; }, abstract = {This paper presents acoustic and articulatory (ultrasound) data on vowel reduction in Polish. The analysis focuses on the question of whether the change in formant value in unstressed vowels can be explained by duration-driven undershoot alone or whether there is also evidence for additional stress-specific articulatory mechanisms that systematically affect vowel formants. On top of the expected durational differences between the stressed and unstressed conditions, the duration is manipulated by inducing changes in the speech rate. The observed vowel formants are compared to expected formants derived from the articulatory midsagittal tongue data in different conditions. The results show that the acoustic vowel space is reduced in size and raised in unstressed vowels compared to stressed vowels. Most of the spectral reduction can be explained by reduced vowel duration, but there is also an additional systematic effect of F1-lowering in unstressed non-high vowels that does not follow from tongue movement. The proposed interpretation is that spectral vowel reduction in Polish behaves largely as predicted by the undershoot model of vowel reduction, but the effect of undershoot is enhanced for low unstressed vowels, potentially by a stress marking strategy which involves raising the fundamental frequency.}, } @article {pmid34340486, year = {2021}, author = {Petersen, EA and Colinot, T and Silva, F and H-Turcotte, V}, title = {The bassoon tonehole lattice: Links between the open and closed holes and the radiated sound spectrum.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {1}, pages = {398}, doi = {10.1121/10.0005627}, pmid = {34340486}, issn = {1520-8524}, abstract = {The acoustics of the bassoon has been the subject of relatively few studies compared with other woodwind instruments. One reason for this may lie in its complicated resonator geometry, which includes irregularly spaced toneholes with chimney heights ranging from 3 to 31 mm. The current article evaluates the effect of the open and closed tonehole lattice (THL) on the acoustic response of the bassoon resonator. It is shown that this response can be divided into three distinct frequency bands that are determined by the open and closed THL: below 500 Hz, 500-2200 Hz, and above 2200 Hz. The first is caused by the stopband of the open THL, where the low frequency effective length of the instrument is determined by the location of the first open tonehole. The second is due to the passband of the open THL, such that the modes are proportional to the total length of the resonator. The third is due to the closed THL, where part of the acoustical power is trapped within the resonator. 
It is proposed that these three frequency bands impact the radiated spectrum by introducing a formant in the vicinity of 500 Hz and suppressing radiation above 2200 Hz for most first register fingerings.}, } @article {pmid34340472, year = {2021}, author = {Uezu, Y and Hiroya, S and Mochida, T}, title = {Articulatory compensation for low-pass filtered formant-altered auditory feedback.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {1}, pages = {64}, doi = {10.1121/10.0004775}, pmid = {34340472}, issn = {1520-8524}, mesh = {Feedback ; Feedback, Sensory ; Speech ; Speech Acoustics ; *Speech Perception ; Speech Production Measurement ; }, abstract = {Auditory feedback while speaking plays an important role in stably controlling speech articulation. Its importance has been verified in formant-altered auditory feedback (AAF) experiments where speakers utter while listening to speech with perturbed first (F1) and second (F2) formant frequencies. However, the contribution of the frequency components higher than F2 to the articulatory control under the perturbations of F1 and F2 has not yet been investigated. In this study, a formant-AAF experiment was conducted in which a low-pass filter was applied to speech. The experimental results showed that the deviation in the compensatory response was significantly larger when a low-pass filter with a cutoff frequency of 3 kHz was used compared to that when cutoff frequencies of 4 and 8 kHz were used. It was also found that the deviation in the 3-kHz condition correlated with the fundamental frequency and spectral tilt of the produced speech. Additional simulation results using a neurocomputational model of speech production (SimpleDIVA model) and the experimental data showed that the feedforward learning rate increased as the cutoff frequency decreased. These results suggest that high-frequency components of the auditory feedback would be involved in the determination of corrective motor commands from auditory errors.}, } @article {pmid34291230, year = {2021}, author = {Lynn, E and Narayanan, SS and Lammert, AC}, title = {Dark tone quality and vocal tract shaping in soprano song production: Insights from real-time MRI.}, journal = {JASA express letters}, volume = {1}, number = {7}, pages = {075202}, pmid = {34291230}, issn = {2691-1191}, abstract = {Tone quality termed "dark" is an aesthetically important property of Western classical voice performance and has been associated with lowered formant frequencies, lowered larynx, and widened pharynx. The present study uses real-time magnetic resonance imaging with synchronous audio recordings to investigate dark tone quality in four professionally trained sopranos with enhanced ecological validity and a relatively complete view of the vocal tract. 
Findings differ from traditional accounts, indicating that labial narrowing may be the primary driver of dark tone quality across performers, while many other aspects of vocal tract shaping are shown to differ significantly in a performer-specific way.}, } @article {pmid34265989, year = {2021}, author = {Liu, R and Wang, G and Deng, D and Zhang, T}, title = {Spin Hall effect of Laguerre-Gaussian beams in PT symmetric metamaterials.}, journal = {Optics express}, volume = {29}, number = {14}, pages = {22192-22201}, doi = {10.1364/OE.427869}, pmid = {34265989}, issn = {1094-4087}, abstract = {The spin Hall effect (SHE) of Laguerre-Gaussian (LG) beams reflected and transmitted in parity-time (PT) symmetric metamaterials is investigated near the coherent-perfect-absorption (CPA)-laser point and exceptional points (EPs). The numerical results show that large transverse shifts occur at the CPA-laser point regardless of the incident direction. But at EPs, the SHE increases at one side and disappears at the other side, thus achieving an intense SHE of the reflected light beams at incidence on the specified side. In addition, it is found that Bragg oscillation can be generated by increasing the period number of PT symmetric metamaterial layers, thus increasing the number of formants in the transverse displacement. In particular, the transverse shift peaks of the transmitted beams merge into a positive peak when the incident angle is close to 90° and essentially do not change with increasing Im(ɛ), which can also be considered a strong tolerance to variation of Im(ɛ). This feature is expected to enable a new type of optoelectronic device with anti-interference performance. These results provide a feasible path for the modulation of the spin Hall effect of light (SHEL) and provide the possibility for the development of new nanophotonic devices.

METHODS: Nineteen adults (ten females, nine males) with a normal voice quality completed sustained vowel tasks. All tasks were performed for each of the six mask conditions: no mask, cloth mask, surgical mask, KN95 mask, and a surgical mask over a KN95 mask with and without a face shield. Intensity measurements were obtained at 1 ft and 6 ft distances from the speaker with sound level meters. Tasks were recorded with a 1 ft mouth-to-microphone distance. Acoustic variables of interest were fundamental frequency (F0) and formant frequencies (F1, F2) for /a/ and /i/, and smoothed cepstral peak prominence (CPPs) for /a/.
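
Cepstral peak prominence is, in essence, the height of the cepstral peak above a regression line fit to the cepstrum; the sketch below computes a simplified, unsmoothed CPP (Praat's CPPs adds time and quefrency smoothing omitted here, and the file name is a placeholder):

```python
# Simplified, unsmoothed cepstral-peak-prominence sketch (closer to CPP than
# to Praat's smoothed CPPs): cepstral peak height above a regression line.
import numpy as np
from scipy.io import wavfile

rate, x = wavfile.read("sustained_a.wav")          # hypothetical mono recording
x = x.astype(np.float64)
x /= np.max(np.abs(x)) + 1e-12

log_spec = np.log(np.abs(np.fft.fft(x * np.hanning(len(x)))) + 1e-12)
cepstrum = np.real(np.fft.ifft(log_spec))

q = np.arange(len(cepstrum)) / rate                # quefrency in seconds
lo, hi = int(rate / 330), int(rate / 60)           # F0 search range 60-330 Hz
peak = lo + int(np.argmax(cepstrum[lo:hi]))

slope, intercept = np.polyfit(q[lo:hi], cepstrum[lo:hi], 1)
cpp = cepstrum[peak] - (slope * q[peak] + intercept)
print(f"cepstral peak near {1.0 / q[peak]:.0f} Hz, CPP = {cpp:.3f}")
```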

RESULTS: Data were analyzed to compare differences between sexes and mask types. Statistically significant differences between males and females were found for intensity measures and all acoustic variables except F2 for /a/ and F1 for /i/. Few pairwise comparisons between masks reached significance, even though main effects for mask type were observed. These are further discussed in the article.

CONCLUSION: The masks tested in this study did not have a significant impact on intensity, fundamental frequency, CPPs, first or second formant frequency compared to voice output without a mask. Use of a face shield seemed to affect intensity and CPPs to some extent. Implications of these findings are discussed further in the article.}, } @article {pmid34260437, year = {2022}, author = {Easwar, V and Birstler, J and Harrison, A and Scollie, S and Purcell, D}, title = {The Influence of Sensation Level on Speech-Evoked Envelope Following Responses.}, journal = {Ear and hearing}, volume = {43}, number = {1}, pages = {250-254}, pmid = {34260437}, issn = {1538-4667}, support = {UL1 TR002373/TR/NCATS NIH HHS/United States ; //CIHR/Canada ; }, mesh = {Acoustic Stimulation ; Adult ; Female ; Humans ; Male ; Phonetics ; Sensation ; *Speech ; *Speech Perception/physiology ; }, abstract = {OBJECTIVES: To evaluate sensation level (SL)-dependent characteristics of envelope following responses (EFRs) elicited by band-limited speech dominant in low, mid, and high frequencies.

DESIGN: In 21 young normal-hearing adults, EFRs were elicited by eight male-spoken speech stimuli: the first formant, and the second and higher formants, of /u/, /a/, and /i/, and the modulated fricatives /∫/ and /s/. Stimulus SL was computed from behaviorally measured thresholds.

RESULTS: At 30 dB SL, the amplitude and phase coherence of fricative-elicited EFRs were ~1.5 to 2 times higher than those of all vowel-elicited EFRs, whereas fewer and smaller differences were found among the vowel-elicited EFRs. For all stimuli, EFR amplitude and phase coherence increased by roughly 50% for every 10 dB increase in SL between ~0 and 50 dB.
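
The phase coherence referred to here is typically quantified as inter-trial phase coherence, the resultant length of the per-trial spectral phase at the response frequency; a self-contained sketch on synthetic epochs, where the sampling rate, response frequency, and trial count are all assumed values:

```python
# Sketch of inter-trial phase coherence (ITPC) and amplitude at an assumed
# EFR frequency, using synthetic EEG epochs as placeholder data.
import numpy as np

rate, f_efr, n_trials = 8000, 100.0, 200          # assumed parameters
rng = np.random.default_rng(0)

t = np.arange(rate) / rate                        # 1-second epochs
trials = 0.1 * np.sin(2 * np.pi * f_efr * t) + rng.normal(size=(n_trials, t.size))

spectra = np.fft.rfft(trials, axis=1)
freqs = np.fft.rfftfreq(t.size, d=1.0 / rate)
k = int(np.argmin(np.abs(freqs - f_efr)))         # bin at the EFR frequency

itpc = np.abs(np.mean(np.exp(1j * np.angle(spectra[:, k]))))
amp = 2.0 * np.mean(np.abs(spectra[:, k])) / t.size
print(f"ITPC = {itpc:.2f} (0 = random, 1 = locked), amplitude = {amp:.3f}")
```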

CONCLUSIONS: Stimulus and frequency dependencies in EFRs persist even after accounting for differences in the audibility of speech sounds. The growth rate of EFR characteristics with SL is independent of the stimulus and its frequency.}, } @article {pmid34256982, year = {2023}, author = {Zealouk, O and Satori, H and Hamidi, M and Laaidi, N and Salek, A and Satori, K}, title = {Analysis of COVID-19 Resulting Cough Using Formants and Automatic Speech Recognition System.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {6}, pages = {971.e9-971.e16}, pmid = {34256982}, issn = {1873-4588}, mesh = {Humans ; *Speech Recognition Software ; Cough/diagnosis/etiology ; Pandemics ; *COVID-19/complications/diagnosis ; Speech ; }, abstract = {As part of our contribution to research on the ongoing worldwide COVID-19 pandemic, we studied cough changes in infected people based on Hidden Markov Model (HMM) speech recognition classification, formant frequency, and pitch analysis. In this paper, an HMM-based cough recognition system was implemented with 5 HMM states, 8 Gaussian mixture distributions (GMMs), and 13 basic Mel-Frequency Cepstral Coefficients (MFCCs), giving an overall feature vector of 39 dimensions. Formant frequency and pitch values extracted from the coughs of COVID-19-infected and healthy people were compared to corroborate the results of our cough recognition system. The experimental results show that the difference between the recognition rates of infected and non-infected people is 6.7%. Formant variation between the coughs of infected and non-infected people is clearly observed for F1, F3, and F4, and is smaller for F0 and F2.}, } @article {pmid34251887, year = {2021}, author = {Easwar, V and Scollie, S and Lasarev, M and Urichuk, M and Aiken, SJ and Purcell, DW}, title = {Characteristics of Speech-Evoked Envelope Following Responses in Infancy.}, journal = {Trends in hearing}, volume = {25}, number = {}, pages = {23312165211004331}, pmid = {34251887}, issn = {2331-2165}, support = {UL1 TR002373/TR/NCATS NIH HHS/United States ; }, mesh = {Acoustic Stimulation ; Hearing Tests ; Humans ; Infant ; Phonetics ; *Speech ; *Speech Perception ; Young Adult ; }, abstract = {Envelope following responses (EFRs) may be a useful tool for evaluating the audibility of speech sounds in infants. The present study aimed to evaluate the characteristics of speech-evoked EFRs in infants with normal hearing, relative to adults, and identify age-dependent changes in EFR characteristics during infancy. In 42 infants and 21 young adults, EFRs were elicited by the first (F1) and the second and higher formants (F2+) of the vowels /u/, /a/, and /i/, dominant in low and mid frequencies, respectively, and by amplitude-modulated fricatives /s/ and /∫/, dominant in high frequencies. In a subset of 20 infants, the in-ear stimulus level was adjusted to match that of an average adult ear (65 dB sound pressure level [SPL]).
We found that (a) adult-infant differences in EFR amplitude, signal-to-noise ratio, and intertrial phase coherence were larger and spread across the frequency range when in-ear stimulus level was adjusted in infants, (b) adult-infant differences in EFR characteristics were the largest for low-frequency stimuli, (c) infants demonstrated adult-like phase coherence when they received a higher (i.e., unadjusted) stimulus level, and (d) EFR phase coherence and signal-to-noise ratio changed with age in the first year of life for a few F2+ vowel stimuli in a level-specific manner. Together, our findings reveal that development-related changes in EFRs during infancy likely vary by stimulus frequency, with low-frequency stimuli demonstrating the largest adult-infant differences. Consistent with previous research, our findings emphasize the significant role of stimulus level calibration methods while investigating developmental trends in EFRs.}, } @article {pmid34241428, year = {2021}, author = {Echternach, M and Herbst, CT and Köberlein, M and Story, B and Döllinger, M and Gellrich, D}, title = {Are source-filter interactions detectable in classical singing during vowel glides?.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {6}, pages = {4565}, doi = {10.1121/10.0005432}, pmid = {34241428}, issn = {1520-8524}, mesh = {Female ; Humans ; Male ; Occupations ; Phonation ; *Singing ; *Voice ; Voice Quality ; }, abstract = {In recent studies, it has been assumed that vocal tract formants (Fn) and the voice source could interact. However, there are only a few studies analyzing this assumption in vivo. Here, the vowel transitions /i/-/a/-/u/-/i/ of 12 professional classical singers (6 females, 6 males) when phonating on the pitch D4 [fundamental frequency (ƒo) ca. 294 Hz] were analyzed using transnasal high-speed videoendoscopy (20,000 fps), electroglottography (EGG), and audio recordings. Fn data were calculated using a cepstral method. Source-filter interaction candidates (SFICs) were determined by (a) algorithmic detection of major intersections of Fn/nƒo and (b) perceptual assessment of the EGG signal. Although the open quotient showed some increase for the /i-a/ and /u-i/ transitions, there were no clear effects at the expected Fn/nƒo intersections. In contrast, ƒo adjustments and changes in the phonovibrogram occurred at perceptually derived SFICs, suggesting level-two interactions. In some cases, these were constituted by intersections between higher nƒo and Fn. The presented data partially corroborate that vowel transitions may result in level-two interactions also in professional singers. However, the lack of systematically detectable effects suggests either the absence of a strong interaction or the existence of confounding factors, which may potentially counterbalance the level-two interactions.}, } @article {pmid34241427, year = {2021}, author = {Zhang, C and Jepson, K and Lohfink, G and Arvaniti, A}, title = {Comparing acoustic analyses of speech data collected remotely.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {6}, pages = {3910}, pmid = {34241427}, issn = {1520-8524}, mesh = {Acoustics ; *COVID-19 ; Humans ; Phonetics ; SARS-CoV-2 ; *Speech ; Speech Acoustics ; }, abstract = {Face-to-face speech data collection has been next to impossible globally as a result of the COVID-19 restrictions.
To address this problem, simultaneous recordings of three repetitions of the cardinal vowels were made using a Zoom H6 Handy Recorder with an external microphone (henceforth, H6) and compared with two alternatives accessible to potential participants at home: the Zoom meeting application (henceforth, Zoom) and two lossless mobile phone applications (Awesome Voice Recorder, and Recorder; henceforth, Phone). F0 was tracked accurately by all of the devices; however, for formant analysis (F1, F2, F3), Phone performed better than Zoom, i.e., more similarly to H6, although the data extraction method (VoiceSauce, Praat) also resulted in differences. In addition, Zoom recordings exhibited unexpected drops in intensity. The results suggest that lossless format phone recordings present a viable option for at least some phonetic studies.}, } @article {pmid34240071, year = {2021}, author = {Diamant, N and Amir, O}, title = {Examining the voice of Israeli transgender women: Acoustic measures, voice femininity and voice-related quality-of-life.}, journal = {International journal of transgender health}, volume = {22}, number = {3}, pages = {281-293}, pmid = {34240071}, issn = {2689-5277}, abstract = {BACKGROUND: Transgender women may experience gender-dysphoria associated with their voice and the way it is perceived. Previous studies have shown that specific acoustic measures are associated with the perception of voice-femininity and with voice-related quality-of-life, yet results are inconsistent.

AIMS: This study aimed to examine the associations between specific voice measures of transgender women, voice-related quality-of-life, and the perception of voice-femininity by listeners and by the speakers themselves.

METHODS: Thirty Hebrew-speaking transgender women were recorded. They also rated their voice-femininity and completed the Hebrew version of the TVQ[MtF] questionnaire. Recordings were analyzed to extract mean fundamental frequency (F0), formant frequencies (F1, F2, F3), and vocal-range (calculated in Hz and in semitones). Recordings were also rated on a 7-point voice-gender scale by 20 naïve cisgender listeners.

RESULTS: Significant correlations were found between both F0 and F1 and listeners' as well as speakers' evaluations of voice-femininity. TVQ[MtF] scores were significantly correlated with F0 and with the lower and upper boundaries of the vocal-range. Voice-femininity ratings were strongly correlated with vocal-range when calculated in Hz, but not when defined in semitones. Listeners' evaluation and speakers' self-evaluation of voice-femininity were significantly correlated. However, TVQ[MtF] scores were significantly correlated only with the speakers' voice-femininity ratings, but not with those of the listeners.

CONCLUSION: Higher F0 and F1, which are perceived as more feminine, jointly improved speakers' satisfaction with their voice. Speakers' self-evaluation of voice-femininity does not mirror listeners' judgment, as it is affected by additional factors related to self-satisfaction and personal experience. Combining listeners' and speakers' voice evaluations with acoustic analysis is valuable, as it provides a more holistic view of how transgender women feel about their voice and how it is perceived by listeners.}, } @article {pmid34232704, year = {2021}, author = {Leung, Y and Oates, J and Chan, SP and Papp, V}, title = {Associations Between Speaking Fundamental Frequency, Vowel Formant Frequencies, and Listener Perceptions of Speaker Gender and Vocal Femininity-Masculinity.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {7}, pages = {2600-2622}, doi = {10.1044/2021_JSLHR-20-00747}, pmid = {34232704}, issn = {1558-9102}, mesh = {Australia ; Female ; *Femininity ; Humans ; Male ; *Masculinity ; Perception ; Speech Acoustics ; }, abstract = {Purpose The aim of the study was to examine associations between speaking fundamental frequency (fos), vowel formant frequencies (F), listener perceptions of speaker gender, and vocal femininity-masculinity. Method An exploratory study was undertaken to examine associations between fos, F1-F3, listener perceptions of speaker gender (nominal scale), and vocal femininity-masculinity (visual analog scale). For 379 speakers of Australian English aged 18-60 years, fos mode and F1-F3 (12 monophthongs; total of 36 Fs) were analyzed on a standard reading passage. Seventeen listeners rated speaker gender and vocal femininity-masculinity on randomized audio recordings of these speakers. Results Model building using principal component analysis suggested the 36 Fs could be succinctly reduced to seven principal components (PCs). Generalized structural equation modeling (with the seven PCs of F and fos as predictors) suggested that only F2 and fos predicted listener perceptions of speaker gender (male, female, unable to decide). However, listener perceptions of vocal femininity-masculinity behaved differently and were predicted by F1, F3, and the contrast between monophthongs at the extremities of the F1 acoustic vowel space, in addition to F2 and fos. Furthermore, listeners' perceptions of speaker gender also influenced ratings of vocal femininity-masculinity substantially. Conclusion Adjusted odds ratios highlighted the substantially larger contribution of F to listener perceptions of speaker gender and vocal femininity-masculinity relative to fos than has previously been reported.}, } @article {pmid34229221, year = {2021}, author = {Easwar, V and Boothalingam, S and Flaherty, R}, title = {Fundamental frequency-dependent changes in vowel-evoked envelope following responses.}, journal = {Hearing research}, volume = {408}, number = {}, pages = {108297}, doi = {10.1016/j.heares.2021.108297}, pmid = {34229221}, issn = {1878-5891}, mesh = {Acoustic Stimulation ; Hearing Tests ; Humans ; Male ; Speech ; *Speech Perception ; *Voice ; Young Adult ; }, abstract = {Scalp-recorded envelope following responses (EFRs) provide a non-invasive method to assess the encoding of the fundamental frequency (f0) of voice that is important for speech understanding. It is well-known that EFRs are influenced by voice f0.
However, this effect of f0 has not been examined independently of concomitant changes in spectra or neural generators. We evaluated the effect of voice f0 on EFRs while controlling for vowel formant characteristics and potentially avoiding significant changes in dominant neural generators using a small f0 range. EFRs were elicited by a male-spoken vowel /u/ (average f0 = 100.4 Hz) and its lowered f0 version (average f0 = 91.9 Hz) with closely matched formant characteristics. Vowels were presented to each ear of 17 young adults with normal hearing. EFRs were simultaneously recorded between the vertex and the nape, and the vertex and the ipsilateral mastoid-the two most common electrode montages used for EFRs. Our results indicate that when vowel formant characteristics are matched, an increase in f0 by 8.5 Hz reduces EFR amplitude by 25 nV, phase coherence by 0.05 and signal-to-noise ratio by 3.5 dB, on average. The reduction in EFR characteristics was similar across ears of stimulation and the two montages used. These findings will help parse the influence of f0 or stimulus spectra on EFRs when both co-vary.}, } @article {pmid34213387, year = {2022}, author = {Eravci, FC and Yildiz, BD and Özcan, KM and Moran, M and Çolak, M and Karakurt, SE and Karakuş, MF and Ikinciogullari, A}, title = {Acoustic parameter changes after bariatric surgery.}, journal = {Logopedics, phoniatrics, vocology}, volume = {47}, number = {4}, pages = {256-261}, doi = {10.1080/14015439.2021.1945676}, pmid = {34213387}, issn = {1651-2022}, mesh = {Humans ; Adult ; Middle Aged ; *Speech Acoustics ; Voice Quality ; Prospective Studies ; Longitudinal Studies ; Acoustics ; *Bariatric Surgery/adverse effects ; Weight Loss ; }, abstract = {OBJECTIVE: To investigate the acoustic parameter changes after weight loss in bariatric surgery patients.

MATERIALS AND METHODS: This prospective, longitudinal study was conducted with 15 patients with planned bariatric surgery, who were evaluated pre-operatively and at 6 months post-operatively. Fundamental frequency (F0), formant frequencies (F1, F2, F3, and F4), frequency perturbation (jitter), amplitude perturbation (shimmer) and noise-to-harmonics ratio (NHR) parameters were evaluated for the vowels /a/, /e/, /i/, /o/, and /u/. Changes in the acoustic analysis parameters for each vowel were compared. The participants were separated into two groups according to whether the Mallampati score had not changed (Group 1) or had decreased (Group 2), and changes in the formant frequencies were compared between these groups.

RESULTS: A total of 15 patients with a median age of 40 ± 11 years completed the study. The median weight of the patients was 122 ± 14 kg pre-operatively and 80 ± 15 kg post-operatively. BMI declined from 46 ± 4 to 31 ± 5 kg/m². The Mallampati score decreased by one point in six patients and remained stable in nine. Of the acoustic voice analysis parameters of the vowels, fundamental frequency generally tended to decrease, while shimmer and jitter values tended to increase. Some of the formant frequencies were specifically affected by the weight loss, and these changes differed significantly between Group 1 and Group 2.

CONCLUSION: The present study reveals that some specific voice characteristics might be affected by successful weight loss after bariatric surgery. Highlights: Obesity reduces the size of the pharyngeal lumen at different levels. The supralaryngeal vocal tract size and configuration is a determinative factor in the features of the voice. Changes in the length and shape of the vocal tract, or the height and position of the tongue, can result in changes, especially in formant frequencies, in acoustic analysis.}, } @article {pmid34160929, year = {2021}, author = {Yang, J}, title = {Vowel development in young Mandarin-English bilingual children.}, journal = {Phonetica}, volume = {78}, number = {3}, pages = {241-272}, doi = {10.1515/phon-2021-2006}, pmid = {34160929}, issn = {1423-0321}, mesh = {Child ; Child, Preschool ; Humans ; Language ; Language Development ; *Multilingualism ; Phonetics ; *Speech Perception ; }, abstract = {This study examined the development of vowel categories in young Mandarin-English bilingual children. The participants included 35 children aged between 3 and 4 years (15 Mandarin-English bilinguals, 6 English monolinguals, and 14 Mandarin monolinguals). The bilingual children were divided into two groups: one group had a shorter duration (<1 year) of intensive immersion in English (Bi-low group) and one group had a longer duration (>1 year) of intensive immersion in English (Bi-high group). The participants were recorded producing one list of Mandarin words containing the vowels /a, i, u, y, ɤ/ and/or one list of English words containing the vowels /i, ɪ, e, ɛ, æ, u, ʊ, o, ɑ, ʌ/. Formant frequency values were extracted at five equidistant time locations (the 20-35-50-65-80% points) over the course of vowel duration. Cross-language and within-language comparisons were conducted on the midpoint formant values and formant trajectories. The results showed that children in the Bi-low group produced their English vowels in clusters and showed positional deviations from the monolingual targets. However, they maintained the phonetic features of their native vowel sounds well and mainly used an assimilatory process to organize the vowel systems. Children in the Bi-high group separated their English vowels well. They used both assimilatory and dissimilatory processes to construct and refine the two vowel systems. These bilingual children approximated monolingual English children to a better extent than the children in the Bi-low group. However, when compared to the monolingual peers, they demonstrated observable deviations in both L1 and L2.}, } @article {pmid34116888, year = {2023}, author = {Lin, Y and Cheng, L and Wang, Q and Xu, W}, title = {Effects of Medical Masks on Voice Assessment During the COVID-19 Pandemic.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {5}, pages = {802.e25-802.e29}, doi = {10.1016/j.jvoice.2021.04.028}, pmid = {34116888}, issn = {1873-4588}, mesh = {Male ; Female ; Humans ; Middle Aged ; Phonation ; Speech Acoustics ; Masks/adverse effects ; Pandemics/prevention & control ; *COVID-19/prevention & control ; *Voice ; }, abstract = {OBJECTIVE: Voice assessment is of great significance to the evaluation of voice quality. Our study aims to explore the effects of medical masks on the acoustic, aerodynamic and formant parameters of healthy people during the COVID-19 pandemic. In addition, we also attempted to verify the differences between sexes and ages.

METHODS: Fifty-three healthy participants (25 males and 28 females) were involved in our study. The acoustic parameters, including fundamental frequency (F0), sound pressure level (SPL), jitter (%), shimmer (%), noise-to-harmonic ratio (NHR) and cepstral peak prominence (CPP), the aerodynamic parameter (maximum phonation time, MPT) and the formant parameters (F1, F2, F3), were measured without and with medical masks. We further investigated the potential differences in the impact on different sexes and ages (≤45 years old and >45 years old).

RESULTS: While wearing medical masks, the SPL significantly increased (71.22±4.25 dB without vs. 72.42±3.96 dB with, P = 0.021). Jitter and shimmer significantly decreased (jitter 1.19±0.83 vs. 0.87±0.67, P = 0.005; shimmer 4.49±2.20 vs. 3.66±2.02, P = 0.002), as did F3 (2855±323.34 Hz vs. 2781.89±353.42 Hz, P = 0.004). F0, MPT, F1 and F2 showed increasing trends without statistical significance, and NHR as well as CPP showed little change without and with medical masks. There were no significant differences between males and females. Regarding age, a significant difference in MPT was seen (>45 years old: 16.15±6.98 s vs. 15.38±7.02 s; ≤45 years old: 20.26±6.47 s vs. 21.44±6.98 s, P = 0.032).

CONCLUSION: Healthy participants showed a significantly higher SPL, a smaller perturbation and an evident decrease in F3 while wearing medical masks. These changes may result from the adjustment of the vocal tract and the filtration function of medical masks, meaning that the stability of the voices we recorded may be overstated. The impact of medical masks did not differ evidently between sexes, while the MPT in the >45-year-old group was influenced more than that in the ≤45-year-old group.}, } @article {pmid34091212, year = {2021}, author = {Madrid, AM and Walker, KA and Smith, SB and Hood, LJ and Prieve, BA}, title = {Relationships between click auditory brainstem response and speech frequency following response with development in infants born preterm.}, journal = {Hearing research}, volume = {407}, number = {}, pages = {108277}, doi = {10.1016/j.heares.2021.108277}, pmid = {34091212}, issn = {1878-5891}, support = {R01 DC011777/DC/NIDCD NIH HHS/United States ; }, mesh = {Child, Preschool ; *Evoked Potentials, Auditory, Brain Stem ; Gestational Age ; Humans ; Infant ; Infant, Newborn ; Infant, Premature ; Speech ; *Speech Perception ; }, abstract = {The speech evoked frequency following response (sFFR) is used to study relationships between neural processing and functional aspects of speech and language that are not captured by click or toneburst evoked auditory brainstem responses (ABR). The sFFR is delayed, deviant, or weak in school-age children with a variety of disorders, including autism, dyslexia, and reading and language disorders, relative to their typically developing peers. Much less is known about the developmental characteristics of the sFFR, especially in preterm infants, who are at risk of language delays. In term neonates, phase locking and spectral representation of the fundamental frequency are developed in the early days of life. Spectral representation of higher harmonics and latencies associated with transient portions of the stimulus are still developing in term infants through at least 10 months of age. The goal of this research was to determine whether sFFR could be measured in preterm infants and to characterize its developmental trajectory in the time and frequency domain. Click ABR and sFFR were measured in 28 preterm infants at ages 33 to 64 weeks gestational age. The sFFR could be measured in the majority of infants at 33 weeks gestational age, and the detectability of all sFFR waves was 100% by 64 weeks gestational age. The latency of all waves associated with the transient portion of the response (waves V, A, and O), and most waves (waves D and E) associated with the quasi-steady state decreased with increasing age. The interpeak wave A-O latency did not change with age, indicating that these waves share a neural generator, or the neural generators are developing at the same rate. The spectral amplitude of F0 and the lower frequencies of the first formant increased with age, but that for higher frequencies of the first formant and higher harmonics did not. The results suggest that the sFFR can be reliably recorded in preterm infants, including those cared for in the neonatal intensive care unit. These findings support the view that in preterm infants, F0 amplitude continues to develop within the first 6 months of life and develops before efficient representation of higher-frequency harmonics.
Further research is needed to determine if the sFFR in preterm infants is predictive of long-term language or learning disorders.}, } @article {pmid34045154, year = {2023}, author = {Andrade, PA and Frič, M and Otčenášek, Z}, title = {Assessment of Changes in Laryngeal Configuration and Voice Parameters Among Different Frequencies of Neuromuscular Electrical Stimulation (NMES) and Cumulative Effects of NMES in a Normophonic Subject: A Pilot Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {4}, pages = {632.e1-632.e20}, doi = {10.1016/j.jvoice.2021.03.018}, pmid = {34045154}, issn = {1873-4588}, mesh = {Humans ; Pilot Projects ; *Voice/physiology ; Vocal Cords/physiology ; Laryngeal Muscles/physiology ; Electric Stimulation ; }, abstract = {INTRODUCTION: Neuromuscular electrical stimulation (NMES) is a complementary resource to voice therapy that can be used for the treatment of hypofunctional voice disorders. Although positive clinical studies have been reported, neutral and even potentially harmful effects of NMES are also described in the literature. Furthermore, in the studies examined by the authors, the use of different methods of NMES has been identified, which further contributes to the inconsistent results found among studies. Moreover, limited rationale is provided for the chosen NMES parameters such as electrode placement, frequency of NMES and length of treatment. The aims of this pilot study were to investigate (a) the impact of different frequencies of NMES on glottal configuration and vocal fold vibration patterns and (b) the changes in laryngeal configuration and vocal output across 12 minutes of NMES.

METHOD: Three experiments were carried out looking at changes in laryngeal configuration and voice output using different imaging techniques (fibreoptic nasolaryngoscopy and high-speed video), acoustical analysis (F0, formant analysis, SPL, CPPS and LHSR values), electroglottography (EGG) and Relative Fundamental Frequency (RFF) analyses. Glottal parameters and acoustical measures were recorded before, during, and after stimulation. Data was collected at rest and during phonation.

RESULTS: Overall, the results showed global changes in laryngeal configuration from normal to hyperfunctional (ie, increased RFF, SPL, CQ, and stiffness). Changes were more pronounced for lower frequencies of NMES and were significant within less than three minutes of application.

CONCLUSION: NMES is an effective resource for the activation of intrinsic laryngeal muscles, producing significant levels of adduction within a few minutes of application. Lower NMES frequencies produced greater muscle activation when compared to higher frequencies.}, } @article {pmid34043445, year = {2021}, author = {Daliri, A}, title = {A Computational Model for Estimating the Speech Motor System's Sensitivity to Auditory Prediction Errors.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6}, pages = {1841-1854}, pmid = {34043445}, issn = {1558-9102}, support = {R21 DC017563/DC/NIDCD NIH HHS/United States ; }, mesh = {Adaptation, Physiological ; Feedback, Sensory ; Female ; Humans ; Sound ; *Speech ; *Speech Perception ; }, abstract = {Purpose The speech motor system uses feedforward and feedback control mechanisms that are both reliant on prediction errors. Here, we developed a state-space model to estimate the error sensitivity of the control systems. We examined (a) whether the model accounts for the error sensitivity of the control systems and (b) whether the two systems have similar error sensitivity. Method Participants (N = 50) completed an adaptation paradigm, in which their first and second formants were perturbed such that a participant's /ε/ would sound like her /ӕ/. We measured adaptive responses to the perturbations at early (0-80 ms) and late (220-300 ms) time points relative to the onset of the perturbations. As data-driven correlates of the error sensitivity of the feedforward and feedback systems, we used the average early responses and difference responses (i.e., late minus early responses), respectively. We fitted the state-space model to participants' adaptive responses and used the model's parameters as model-based estimates of error sensitivity. Results We found that the late responses were larger than the early responses. Additionally, the model-based estimates of error sensitivity strongly correlated with the data-driven estimates. However, the data-driven and model-based estimates of error sensitivity of the feedforward system did not correlate with those of the feedback system. Conclusions Overall, our results suggested that the dynamics of adaptive responses as well as error sensitivity of the control systems can be accurately predicted by the model. Furthermore, our results suggested that the feedforward and feedback control systems function independently. Supplemental Material https://doi.org/10.23641/asha.14669808.}, } @article {pmid34019777, year = {2021}, author = {Souza, PE and Ellis, G and Marks, K and Wright, R and Gallun, F}, title = {Does the Speech Cue Profile Affect Response to Amplitude Envelope Distortion?.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6}, pages = {2053-2069}, pmid = {34019777}, issn = {1558-9102}, support = {R01 DC006014/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Cues ; *Hearing Aids ; *Hearing Loss, Sensorineural ; Humans ; Speech ; *Speech Perception ; }, abstract = {Purpose A broad area of interest to our group is to understand the consequences of the "cue profile" (a measure of how well a listener can utilize audible temporal and/or spectral cues) for listening scenarios in which a subset of cues is distorted.
The study goal was to determine if listeners whose cue profile indicated that they primarily used temporal cues for recognition would respond differently to speech-envelope distortion than listeners who utilized both spectral and temporal cues. Method Twenty-five adults with sensorineural hearing loss participated in the study. The listener's cue profile was measured by analyzing identification patterns for a set of synthetic syllables in which envelope rise time and formant transitions were varied. A linear discriminant analysis quantified the relative contributions of spectral and temporal cues to identification patterns. Low-context sentences in noise were processed with time compression, wide-dynamic range compression, or a combination of time compression and wide-dynamic range compression to create a range of speech-envelope distortions. An acoustic metric, a modified version of the Spectral Correlation Index, was calculated to quantify envelope distortion. Results A binomial generalized linear mixed-effects model indicated that envelope distortion, the cue profile, the interaction between envelope distortion and the cue profile, and the pure-tone average were significant predictors of sentence recognition. Conclusions The listeners with good perception of spectro-temporal contrasts were more resilient to the detrimental effects of envelope compression than listeners who used temporal cues to a greater extent. The cue profile may provide information about individual listening that can direct choice of hearing aid parameters, especially those parameters that affect the speech envelope.}, } @article {pmid33987821, year = {2021}, author = {Stilp, CE and Assgari, AA}, title = {Contributions of natural signal statistics to spectral context effects in consonant categorization.}, journal = {Attention, perception & psychophysics}, volume = {83}, number = {6}, pages = {2694-2708}, pmid = {33987821}, issn = {1943-393X}, mesh = {Acoustic Stimulation ; Humans ; Language ; *Phonetics ; Sound ; Sound Spectrography ; Speech Acoustics ; *Speech Perception ; }, abstract = {Speech perception, like all perception, takes place in context. Recognition of a given speech sound is influenced by the acoustic properties of surrounding sounds. When the spectral composition of earlier (context) sounds (e.g., a sentence with more energy at lower third formant [F3] frequencies) differs from that of a later (target) sound (e.g., consonant with intermediate F3 onset frequency), the auditory system magnifies this difference, biasing target categorization (e.g., towards higher-F3-onset /d/). Historically, these studies used filters to force context stimuli to possess certain spectral compositions. Recently, these effects were produced using unfiltered context sounds that already possessed the desired spectral compositions (Stilp & Assgari, 2019, Attention, Perception, & Psychophysics, 81, 2037-2052). Here, this natural signal statistics approach is extended to consonant categorization (/g/-/d/). Context sentences were either unfiltered (already possessing the desired spectral composition) or filtered (to imbue specific spectral characteristics). Long-term spectral characteristics of unfiltered contexts were poor predictors of shifts in consonant categorization, but short-term characteristics (last 475 ms) were excellent predictors. This diverges from vowel data, where long-term and shorter-term intervals (last 1,000 ms) were equally strong predictors. 
Thus, time scale plays a critical role in how listeners attune to signal statistics in the acoustic environment.}, } @article {pmid33979206, year = {2021}, author = {Dromey, C and Richins, M and Low, T}, title = {Kinematic and Acoustic Changes to Vowels and Diphthongs in Bite Block Speech.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6}, pages = {1794-1801}, doi = {10.1044/2021_JSLHR-20-00630}, pmid = {33979206}, issn = {1558-9102}, mesh = {Acoustics ; Biomechanical Phenomena ; Humans ; Phonetics ; *Speech ; *Speech Acoustics ; Young Adult ; }, abstract = {Purpose We examined the effect of bite block insertion (BBI) on lingual movements and formant frequencies in corner vowel and diphthong production in a sentence context. Method Twenty young adults produced the corner vowels (/u/, /ɑ/, /æ/, /i/) and the diphthong /ɑɪ/ in sentence contexts before and after BBI. An electromagnetic articulograph measured the movements of the tongue back, middle, and front. Results There were significant decreases in the acoustic vowel articulation index and vowel space area following BBI. The kinematic vowel articulation index decreased significantly for the back and middle of the tongue but not for the front. There were no significant acoustic changes post-BBI for the diphthong, other than a longer transition duration. Diphthong kinematic changes after BBI included smaller movements for the back and middle of the tongue, but not the front. Conclusions BBI led to a smaller acoustic working space for the corner vowels. The adjustments made by the front of the tongue were sufficient to compensate for the BBI perturbation in the diphthong, resulting in unchanged formant trajectories. The back and middle of the tongue were likely biomechanically restricted in their displacement by the fixation of the jaw, whereas the tongue front showed greater movement flexibility.}, } @article {pmid33977813, year = {2024}, author = {Onosson, S and Stewart, J}, title = {The Effects of Language Contact on Non-Native Vowel Sequences in Lexical Borrowings: The Case of Media Lengua.}, journal = {Language and speech}, volume = {67}, number = {2}, pages = {498-527}, pmid = {33977813}, issn = {1756-6053}, mesh = {Humans ; *Phonetics ; *Speech Acoustics ; Female ; Male ; *Multilingualism ; Adult ; Speech Production Measurement ; Young Adult ; Language ; }, abstract = {Media Lengua (ML), a mixed language derived from Quichua and Spanish, exhibits a phonological system that largely conforms to that of Quichua acoustically. Yet, it incorporates a large number of vowel sequences from Spanish which do not occur in the Quichua system. This includes the use of mid-vowels, which are phonetically realized in ML as largely overlapping with the high-vowels in acoustic space. We analyze and compare production of vowel sequences by speakers of ML, Quichua, and Spanish through the use of generalized additive mixed models to determine statistically significant differences between vowel formant trajectories. Our results indicate that Spanish-derived ML vowel sequences frequently differ significantly from their Spanish counterparts, largely occupying a more central region of the vowel space and frequently exhibiting markedly reduced trajectories over time. In contrast, we find only one case where an ML vowel sequence differs significantly from its Quichua counterpart-and even in this case the difference from Spanish is substantially greater. 
Our findings show how the vowel system of ML successfully integrates novel vowel sequence patterns from Spanish into what is essentially Quichua phonology by markedly adapting their production, while still maintaining contrasts which are not expressed in Quichua.}, } @article {pmid33951578, year = {2021}, author = {Isler, B and Giroud, N and Hirsiger, S and Kleinjung, T and Meyer, M}, title = {Bilateral age-related atrophy in the planum temporale is associated with vowel discrimination difficulty in healthy older adults.}, journal = {Hearing research}, volume = {406}, number = {}, pages = {108252}, doi = {10.1016/j.heares.2021.108252}, pmid = {33951578}, issn = {1878-5891}, mesh = {Acoustic Stimulation ; Aged ; Atrophy/pathology ; *Auditory Cortex/diagnostic imaging ; Humans ; Speech ; *Speech Perception ; *Temporal Lobe/pathology ; }, abstract = {In this study we investigated the association between age-related brain atrophy and behavioural as well as electrophysiological markers of vowel perception in a sample of healthy younger and older adults with normal pure-tone hearing. Twenty-three older adults and 27 younger controls discriminated a set of vowels with altered second formants embedded in consonant-vowel syllables. Additionally, mismatch negativity (MMN) responses were recorded in a separate oddball paradigm with the same set of stimuli. A structural magnetic resonance scan was obtained for each participant to determine cortical architecture of the left and right planum temporale (PT). The PT was chosen for its function as a major processor of auditory cues and speech. Results suggested that older adults performed worse in vowel discrimination despite normal-for-age pure-tone hearing. In the older group, we found evidence that those with greater age-related cortical atrophy (i.e., lower cortical surface area and cortical volume) in the left and right PT also showed weaker vowel discrimination. In comparison, we found a lateralized correlation in the younger group suggesting that those with greater cortical thickness in only the left PT performed worse in the vowel discrimination task. We did not find any associations between macroanatomical traits of the PT and MMN responses. We conclude that deficient vowel processing is not only caused by pure-tone hearing loss but is also influenced by atrophy-related changes in the ageing auditory-related cortices. Furthermore, our results suggest that auditory processing might become more bilateral across the lifespan.}, } @article {pmid33938165, year = {2021}, author = {Xiao, Y and Wang, T and Deng, W and Yang, L and Zeng, B and Lao, X and Zhang, S and Liu, X and Ouyang, D and Liao, G and Liang, Y}, title = {Data mining of an acoustic biomarker in tongue cancers and its clinical validation.}, journal = {Cancer medicine}, volume = {10}, number = {11}, pages = {3822-3835}, pmid = {33938165}, issn = {2045-7634}, mesh = {Adult ; Aged ; Analysis of Variance ; Area Under Curve ; Articulation Disorders/diagnosis/*physiopathology ; China ; Cross-Sectional Studies ; *Data Mining ; Female ; Humans ; Male ; Middle Aged ; Quality of Life ; Sex Factors ; Speech Production Measurement/methods ; Support Vector Machine ; Tongue/surgery ; Tongue Neoplasms/diagnosis/pathology/*physiopathology/surgery ; }, abstract = {The promise of speech disorders as biomarkers in clinical examination has been identified in a broad spectrum of neurodegenerative diseases.
However, to the best of our knowledge, a validated acoustic marker with established discriminative and evaluative properties has not yet been developed for oral tongue cancers. Here we cross-sectionally collected a screening dataset that included acoustic parameters extracted from 3 sustained vowels /ɑ/, /i/, /u/ and binary perceptual outcomes from 12 consonant-vowel syllables. We used a support vector machine with a linear kernel function within this dataset to identify the formant centralization ratio (FCR) as a dominant predictor of different perceptual outcomes across gender and syllable. The Acoustic analysis, Perceptual evaluation and Quality of Life assessment (APeQoL) was used to validate the FCR in 33 patients with primary resectable oral tongue cancers. Measurements were taken before (pre-op) and four to six weeks after (post-op) surgery. The speech handicap index (SHI), a speech-specific questionnaire, was also administered at these time points. Pre-op correlation analysis within the APeQoL revealed overall consistency and a strong correlation between FCR and SHI scores. FCRs also increased significantly with increasing T classification pre-operatively, especially for women. Longitudinally, the main effects of T classification, the extent of resection, and their interaction effects with time (pre-op vs. post-op) on FCRs were all significant. For pre-operative FCR, after merging the two datasets, a cut-off value of 0.970 produced an AUC of 0.861 (95% confidence interval: 0.785-0.938) for T3-4 patients. In sum, this study determined that FCR is an acoustic marker with the potential to detect disease and related speech function in oral tongue cancers. These are preliminary findings that need to be replicated in longitudinal studies and/or larger cohorts.}, } @article {pmid33909840, year = {2021}, author = {Rocha-Muniz, CN and Schochat, E}, title = {Investigation of the neural discrimination of acoustic characteristics of speech sounds in normal-hearing individuals through Frequency-following Response (FFR).}, journal = {CoDAS}, volume = {33}, number = {1}, pages = {e20180324}, doi = {10.1590/2317-1782/20202018324}, pmid = {33909840}, issn = {2317-1782}, mesh = {Acoustic Stimulation ; Acoustics ; Child ; Evoked Potentials, Auditory, Brain Stem ; Hearing ; Humans ; *Phonetics ; *Speech Perception ; }, abstract = {PURPOSE: To evaluate how the auditory pathways encode and discriminate the plosive syllables [ga], [da] and [ba] using the auditory evoked Frequency-following Response (FFR) in children with typical development.

METHODS: Twenty children aged 6-12 years were evaluated using the FFR for the [ga], [da] and [ba] stimuli. The stimuli were composed of six formants and were differentiated in the F2 to F3 transition (transient portion). The other formants were identical in the three syllables (sustained portion). The latencies of the 16 waves of the transient portion (<70 ms) and of the 21 waves of the sustained portion (90-160 ms) of the stimuli were analyzed in the neural responses obtained for each of the syllables.

RESULTS: The transient portion latencies were different in the three syllables, indicating a distinction in the acoustic characteristics of these syllables through their neural representations. In addition, the transient portion latencies progressively increased in the following order: [ga] < [da] < [ba], whereas no significant differences were observed in the sustained portion.

CONCLUSION: The FFR proved to be an efficient tool to investigate the subcortical acoustic differences in speech sounds, since it demonstrated different electrophysiological responses for the three evoked syllables. Changes in latency were observed in the transient portion (consonants) but not in the sustained portion (vowels) for the three stimuli. These results indicate the neural ability to distinguish between acoustic characteristics of the [ga], [da] and [ba] stimuli.}, } @article {pmid33900806, year = {2021}, author = {Chiu, YF and Neel, A and Loux, T}, title = {Exploring the Acoustic Perceptual Relationship of Speech in Parkinson's Disease.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {5}, pages = {1560-1570}, doi = {10.1044/2021_JSLHR-20-00610}, pmid = {33900806}, issn = {1558-9102}, mesh = {Acoustics ; Aged ; Dysarthria/diagnosis/etiology ; Humans ; *Parkinson Disease/complications ; *Speech ; Speech Acoustics ; Speech Intelligibility ; Speech Production Measurement ; }, abstract = {Purpose Auditory perceptual judgments are commonly used to diagnose dysarthria and assess treatment progress. The purpose of the study was to examine the acoustic underpinnings of perceptual speech abnormalities in individuals with Parkinson's disease (PD). Method Auditory perceptual judgments were obtained from sentences produced by 13 speakers with PD and five healthy older adults. Twenty young listeners rated overall ease of understanding, articulatory precision, voice quality, and prosodic adequacy on a visual analog scale. Acoustic measures associated with the speech subsystems of articulation, phonation, and prosody were obtained, including second formant transitions, articulation rate, cepstral and spectral measures of voice, and pitch variations. Regression analyses were performed to assess the relationships between perceptual judgments and acoustic variables. Results Perceptual impressions of Parkinsonian speech were related to combinations of several acoustic variables. Approximately 36%-49% of the variance in the perceptual ratings was explained by the acoustic measures, indicating a modest acoustic perceptual relationship. Conclusions The relationships between perceptual ratings and acoustic signals in Parkinsonian speech are multifactorial and involve a variety of acoustic features simultaneously. The modest acoustic perceptual relationships, however, suggest that future work is needed to further examine the acoustic bases of perceptual judgments in dysarthria.}, } @article {pmid33900786, year = {2021}, author = {Parrell, B and Ivry, RB and Nagarajan, SS and Houde, JF}, title = {Intact Correction for Self-Produced Vowel Formant Variability in Individuals With Cerebellar Ataxia Regardless of Auditory Feedback Availability.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6S}, pages = {2234-2247}, pmid = {33900786}, issn = {1558-9102}, support = {R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; }, mesh = {*Cerebellar Ataxia ; Feedback ; Feedback, Sensory ; Humans ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception ; }, abstract = {Purpose Individuals with cerebellar ataxia (CA) caused by cerebellar degeneration exhibit larger reactive compensatory responses to unexpected auditory feedback perturbations than neurobiologically typical speakers, suggesting they may rely more on feedback control during speech.
We test this hypothesis by examining variability in unaltered speech. Previous studies of typical speakers have demonstrated a reduction in formant variability (centering) observed during the initial phase of vowel production from vowel onset to vowel midpoint. Centering is hypothesized to reflect feedback-based corrections for self-produced variability and thus may provide a behavioral assay of feedback control in unperturbed speech in the same manner as the compensatory response does for feedback perturbations. Method To comprehensively compare centering in individuals with CA and controls, we examine centering in two vowels (/i/ and /ɛ/) under two contexts (isolated words and connected speech). As a control, we examine speech produced both with and without noise to mask auditory feedback. Results Individuals with CA do not show increased centering compared to age-matched controls, regardless of vowel, context, or masking. Contrary to previous results in neurobiologically typical speakers, centering was not affected by the presence of masking noise in either group. Conclusions The similar magnitude of centering seen with and without masking noise questions whether centering is driven by auditory feedback. However, if centering is at least partially driven by auditory/somatosensory feedback, these results indicate that the larger compensatory response to altered auditory feedback observed in individuals with CA may not reflect typical motor control processes during normal, unaltered speech production.}, } @article {pmid33895925, year = {2021}, author = {Kovalenko, AN and Kastyro, IV and Reshetov, IV and Popadyuk, VI}, title = {Study of the Role of Hearing Aid on the Area of the Acoustic Field of Vowels.}, journal = {Doklady. Biochemistry and biophysics}, volume = {497}, number = {1}, pages = {108-111}, pmid = {33895925}, issn = {1608-3091}, mesh = {*Acoustics ; Adult ; Female ; *Hearing Aids ; Humans ; Male ; Sound ; }, abstract = {The method of transformation of acoustic vowel triangles (AVT) /a/, /i/, /u/ was used for an objective assessment of the acoustic features of vowels in the speech production of 20 persons with long-term hearing impairment (LHI). The logarithm of the values of the first two formants of each vowel (logF1, logF2) was determined for each subject. AVTs were transformed into right-angled triangles whose /u/ vertices were moved to the origin of coordinates and whose legs were aligned with the coordinate axes. In patients with LHI, the size of the triangles usually decreased, and they were stretched along one of the axes, which probably depends not only on the hearing loss severity but also on the duration of hearing aid use.
The presented approach to the normalization of AVTs makes it possible to distinguish at least three groups of persons with LHI: in the first group, vowel triangles are stretched along the logF1 axis; in the second group, vowel triangles are stretched along the logF2 axis; and in the third group, AVTs are symmetric.}, } @article {pmid33863624, year = {2023}, author = {Lã, FMB and Silva, LS and Granqvist, S}, title = {Long-Term Average Spectrum Characteristics of Portuguese Fado-Canção from Coimbra.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {4}, pages = {631.e7-631.e15}, doi = {10.1016/j.jvoice.2021.03.005}, pmid = {33863624}, issn = {1873-4588}, mesh = {Humans ; Speech Acoustics ; Portugal ; *Voice ; *Singing ; Acoustics ; }, abstract = {Descriptions of acoustical characteristics of Fado, a Portuguese urban style sung in Lisbon and Oporto, are scarce, particularly concerning Fado-Canção, a related style sung in Coimbra. The present study aims at describing long-term average spectrum (LTAS) parameters of 16 professional singers while singing and reading the lyrics of a typical Fado-Canção. LTAS parameters were investigated in terms of: (1) equivalent sound level (Leq); (2) spectral differences between 3 frequency bands 0-2, 2-5, and 5-8 kHz; and (3) quantification of spectral prominence between 2 and 4 kHz, calculated as the level difference between the peak in this frequency region and a reference trendline between 1 and 5 kHz, henceforth Formant Cluster Prominence (FCP). Given that Fado-Canção, besides Fado and traditional styles, originated also from classical singing, and that previous studies on Fado suggest the absence of a singer's formant cluster, the averaged LTAS for all Fado-Canção singers was further compared to the LTAS of two world-touring opera baritones singing an operatic aria and a lied. Results show that Fado-Canção is commonly sung with a Leq of 86.4 dB and an FCP of about 10 dB, values significantly higher when compared to reading. The FCP in Fado-Canção, although smaller than for the two classical opera singers' examples (14.8 and 20 dB, respectively), suggests that the style preserved some of its original lyrical influence. However, because younger singers present higher energy in the 5-8 kHz region relative to the remaining frequency bands as compared to older singers, it seems that Fado-Canção may be drifting towards non-classical vocal practices. FCP seems to be a promising straightforward method to quantify the degree of formant clustering around the region of the singer's formant in LTAS, allowing comparisons between different singers and singing styles.}, } @article {pmid33856659, year = {2021}, author = {Loni, DY and Subbaraman, S}, title = {Genetically related singers-acoustic feature analysis and impact on singer identification.}, journal = {Journal of applied genetics}, volume = {62}, number = {3}, pages = {459-467}, pmid = {33856659}, issn = {2190-3883}, mesh = {Acoustics ; Female ; Humans ; Male ; Music ; Parents ; Siblings ; Singing/*genetics ; Voice Quality/*genetics ; }, abstract = {Studies relating music to genetics form a fascinating field of research. In this study, we attempted to answer a curious question: how acoustically close are genetically related singers? We investigated this question using two types of genetic relationship: three female sibling singers, and a father-son singer pair.
These are famous Indian playback singers, and the acoustic features were extracted from songs of Bollywood films. Three different self-developed a cappella databases were used for the experiments. Positive correlations among the major musical aptitudes (pitch, vibrato, formant, and harmonic spectral envelope) for both singer relationships revealed the genetic impact on the acoustic features. Also, investigation of the timbre spectral feature proved it to be a significant acoustic feature that differentiates similar voices. With Spearman's correlation coefficient, we conclude that a strong acoustical association was observed between the acoustic features of genetically related singers, especially the female sibling singers. This was further validated by correlating these singers with genetically unrelated singers. A human perception test performed using cover songs indicated the genetic impact on voice similarity, while the automatic singer identification system discriminated singers more accurately than the human listeners.}, } @article {pmid33833720, year = {2021}, author = {Hsieh, IH and Yeh, WT}, title = {The Interaction Between Timescale and Pitch Contour at Pre-attentive Processing of Frequency-Modulated Sweeps.}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {637289}, pmid = {33833720}, issn = {1664-1078}, abstract = {Speech comprehension across languages depends on encoding the pitch variations in frequency-modulated (FM) sweeps at different timescales and frequency ranges. While timescale and spectral contour of FM sweeps play important roles in differentiating acoustic speech units, relatively little work has been done to understand the interaction between the two acoustic dimensions at early cortical processing. An auditory oddball paradigm was employed to examine the interaction of timescale and pitch contour at pre-attentive processing of FM sweeps. Event-related potentials to frequency sweeps that vary in linguistically relevant pitch contour (fundamental frequency F0 vs. first formant frequency F1) and timescale (local vs. global) in Mandarin Chinese were recorded. Mismatch negativities (MMNs) were elicited by all types of sweep deviants. For the local timescale, FM sweeps with F0 contours yielded larger MMN amplitudes than F1 contours. A reversed MMN amplitude pattern was obtained with respect to F0/F1 contours for global-timescale stimuli. An interhemispheric asymmetry of MMN topography was observed corresponding to local- and global-timescale contours. Falling, but not rising, sweep contours elicited right hemispheric dominance in the difference waveforms. Results showed that timescale and pitch contour interact with each other in pre-attentive auditory processing of FM sweeps. Findings suggest that FM sweeps, a type of non-speech signal, are processed at an early stage with reference to their linguistic function.
That the dynamic interaction between timescale and spectral pattern is processed during early cortical processing of non-speech frequency sweeps may be critical for facilitating speech encoding at a later stage.}, } @article {pmid33833252, year = {2021}, author = {Wright, E and Grawunder, S and Ndayishimiye, E and Galbany, J and McFarlin, SC and Stoinski, TS and Robbins, MM}, title = {Chest beats as an honest signal of body size in male mountain gorillas (Gorilla beringei beringei).}, journal = {Scientific reports}, volume = {11}, number = {1}, pages = {6879}, pmid = {33833252}, issn = {2045-2322}, mesh = {Acoustics ; Animals ; *Body Size ; *Competitive Behavior ; Gorilla gorilla/*physiology ; Male ; *Reproduction ; Thorax/*physiology ; Vocalization, Animal/*physiology ; }, abstract = {Acoustic signals that reliably indicate body size, which usually determines competitive ability, are of particular interest for understanding how animals assess rivals and choose mates. Whereas body size tends to be negatively associated with formant dispersion in animal vocalizations, non-vocal signals have received little attention. Among the most emblematic sounds in the animal kingdom is the chest beat of gorillas, a non-vocal signal that is thought to be important in intra- and inter-sexual competition, yet it is unclear whether it reliably indicates body size. We examined the relationship among body size (back breadth), peak frequency, and three temporal characteristics of the chest beat: duration, number of beats and beat rate from sound recordings of wild adult male mountain gorillas. Using linear mixed models, we found that larger males had significantly lower peak frequencies than smaller ones, but we found no consistent relationship between body size and the temporal characteristics measured. Taken together with earlier findings of positive correlations among male body size, dominance rank and reproductive success, we conclude that the gorilla chest beat is an honest signal of competitive ability. These results emphasize the potential of non-vocal signals to convey important information in mammal communication.}, } @article {pmid33831309, year = {2021}, author = {Jekiel, M and Malarski, K}, title = {Musical Hearing and Musical Experience in Second Language English Vowel Acquisition.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {5}, pages = {1666-1682}, doi = {10.1044/2021_JSLHR-19-00253}, pmid = {33831309}, issn = {1558-9102}, mesh = {Adult ; Hearing ; Hearing Tests ; Humans ; Language ; *Multilingualism ; *Music ; Phonetics ; *Speech Perception ; }, abstract = {Purpose Former studies suggested that music perception can help produce certain accentual features in the first and second language (L2), such as intonational contours. What was missing in many of these studies was the identification of the exact relationship between specific music perception skills and the production of different accentual features in a foreign language. Our aim was to verify whether empirically tested musical hearing skills can be related to the acquisition of English vowels by learners of English as an L2 before and after a formal accent training course. Method Fifty adult Polish speakers of L2 English were tested before and after a two-semester accent training course in order to observe the effect of musical hearing on the acquisition of English vowels.
Their L2 English vowel formant contours produced in consonant-vowel-consonant context were compared with the target General British vowels produced by their pronunciation teachers. We juxtaposed these results with their musical hearing test scores and self-reported musical experience to observe a possible relationship between successful L2 vowel acquisition and musical aptitude. Results Preexisting rhythmic memory was reported as a significant predictor before training, while musical experience was reported as a significant factor in the production of more native-like L2 vowels after training. We also observed that not all vowels were equally acquired or affected by musical hearing or musical experience. The strongest estimate we observed was the closeness to the model before training, suggesting that learners who already managed to acquire some features of a native-like accent were also more successful after training. Conclusions Our results are revealing in two respects. First, the learners' former proficiency in L2 pronunciation is the most robust predictor in acquiring a native-like accent. Second, there is a potential relationship between rhythmic memory and L2 vowel acquisition before training, as well as years of musical experience after training, suggesting that specific musical skills and music practice can be an asset in learning a foreign language accent.}, } @article {pmid33825503, year = {2021}, author = {Michell, CT and Nyman, T}, title = {Microbiomes of willow-galling sawflies: effects of host plant, gall type, and phylogeny on community structure and function.}, journal = {Genome}, volume = {64}, number = {6}, pages = {615-626}, doi = {10.1139/gen-2020-0018}, pmid = {33825503}, issn = {1480-3321}, mesh = {Animals ; Bacteria/*classification/*genetics ; Biodiversity ; Host Microbial Interactions ; Host Specificity ; Insecta ; Larva ; Microbiota/*genetics/*physiology ; *Phylogeny ; Plant Growth Regulators ; Plant Leaves ; RNA, Ribosomal, 16S/genetics ; Salix/*microbiology ; }, abstract = {While free-living herbivorous insects are thought to harbor microbial communities composed of transient bacteria derived from their diet, recent studies indicate that insects that induce galls on plants may be involved in more intimate host-microbe relationships. We used 16S rDNA metabarcoding to survey larval microbiomes of 20 nematine sawfly species that induce bud or leaf galls on 13 Salix species. The 391 amplicon sequence variants (ASVs) detected represented 69 bacterial genera in six phyla. Multivariate statistical analyses showed that the structure of larval microbiomes is influenced by willow host species as well as by gall type. Nevertheless, a "core" microbiome composed of 58 ASVs is shared widely across the focal galler species. Within the core community, the presence of many abundant, related ASVs representing multiple distantly related bacterial taxa is reflected as a statistically significant effect of bacterial phylogeny on galler-microbe associations. Members of the core community have a variety of inferred functions, including degradation of phenolic compounds, nutrient supplementation, and production of plant hormones.
Hence, our results support suggestions of intimate and diverse interactions between galling insects and microbes and add to a growing body of evidence that microbes may play a role in the induction of insect galls on plants.}, } @article {pmid33798490, year = {2021}, author = {Zhang, K and Sjerps, MJ and Peng, G}, title = {Integral perception, but separate processing: The perceptual normalization of lexical tones and vowels.}, journal = {Neuropsychologia}, volume = {156}, number = {}, pages = {107839}, doi = {10.1016/j.neuropsychologia.2021.107839}, pmid = {33798490}, issn = {1873-3514}, mesh = {Adult ; Cues ; Humans ; Language ; Phonetics ; Pitch Perception ; Speech ; *Speech Acoustics ; *Speech Perception ; }, abstract = {In tonal languages, speech variability arises in both lexical tone (i.e., suprasegmentally) and vowel quality (segmentally). Listeners can use surrounding speech context to overcome variability in both speech cues, a process known as extrinsic normalization. Although vowels are the main carriers of tones, it is still unknown whether the combined percept (lexical tone and vowel quality) is normalized integrally or in partly separate processes. Here we used electroencephalography (EEG) to investigate the time course of lexical tone normalization and vowel normalization to answer this question. Cantonese adults listened to synthesized three-syllable stimuli in which the identity of a target syllable - ambiguous between high vs. mid-tone (Tone condition) or between /o/ vs. /u/ (Vowel condition) - was dependent on either the tone range (Tone condition) or the formant range (Vowel condition) of the first two syllables. It was observed that the ambiguous tone was more often interpreted as a high-level tone when the context had a relatively low pitch than when it had a high pitch (Tone condition). Similarly, the ambiguous vowel was more often interpreted as /o/ when the context had a relatively low formant range than when it had a relatively high formant range (Vowel condition). These findings show the typical pattern of extrinsic tone and vowel normalization. Importantly, the EEG results of participants showing the contrastive normalization effect demonstrated that the effects of vowel normalization could already be observed within the N2 time window (190-350 ms), while the first reliable effect of lexical tone normalization on cortical processing was observable only from the P3 time window (220-500 ms) onwards. The ERP patterns demonstrate that the contrastive perceptual normalization of lexical tones and that of vowels occur in at least partially separate time windows.
This suggests that the extrinsic normalization can operate at the level of phonemes and tonemes separately instead of operating on the whole syllable at once.}, } @article {pmid33795617, year = {2021}, author = {Smith, ML and Winn, MB}, title = {Individual Variability in Recalibrating to Spectrally Shifted Speech: Implications for Cochlear Implants.}, journal = {Ear and hearing}, volume = {42}, number = {5}, pages = {1412-1427}, pmid = {33795617}, issn = {1538-4667}, support = {R01 DC017114/DC/NIDCD NIH HHS/United States ; R03 DC014309/DC/NIDCD NIH HHS/United States ; }, mesh = {*Cochlear Implantation ; *Cochlear Implants ; Humans ; Reproducibility of Results ; Speech ; *Speech Perception ; }, abstract = {OBJECTIVES: Cochlear implant (CI) recipients are at a severe disadvantage compared with normal-hearing listeners in distinguishing consonants that differ by place of articulation because the key relevant spectral differences are degraded by the implant. One component of that degradation is the upward shifting of spectral energy that occurs with a shallow insertion depth of a CI. The present study aimed to systematically measure the effects of spectral shifting on word recognition and phoneme categorization by specifically controlling the amount of shifting and using stimuli whose identification specifically depends on perceiving frequency cues. We hypothesized that listeners would be biased toward perceiving phonemes that contain higher-frequency components because of the upward frequency shift and that intelligibility would decrease as spectral shifting increased.

DESIGN: Normal-hearing listeners (n = 15) heard sine wave-vocoded speech with simulated upward frequency shifts of 0, 2, 4, and 6 mm of cochlear space to simulate shallow CI insertion depth. Stimuli included monosyllabic words and /b/-/d/ and /ʃ/-/s/ continua that varied systematically by formant frequency transitions or frication noise spectral peaks, respectively. Recalibration to spectral shifting was operationally defined as shifting perceptual acoustic-phonetic mapping commensurate with the spectral shift; in other words, adjusting frequency expectations for both phonemes upward so that a perceptual distinction is preserved, rather than hearing all upward-shifted phonemes as the higher-frequency member of the pair.
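For orientation, shifts expressed in millimeters of cochlear space are conventionally related to frequency through Greenwood's place-frequency function. The sketch below (our illustration using the standard human constants, not code from the study) converts a component's frequency to cochlear place, displaces it basally, and converts back:

    import numpy as np

    # Greenwood (1990) human map: f = A * (10**(a * x) - 1), x in mm from the apex
    A, a = 165.4, 0.06

    def place_from_freq(f_hz):
        """Cochlear place (mm from apex) corresponding to a frequency in Hz."""
        return np.log10(f_hz / A + 1.0) / a

    def freq_from_place(x_mm):
        """Frequency (Hz) at a given cochlear place (mm from apex)."""
        return A * (10.0 ** (a * x_mm) - 1.0)

    def shift_freq(f_hz, shift_mm):
        """Frequency reached when a component is displaced basally by shift_mm."""
        return freq_from_place(place_from_freq(f_hz) + shift_mm)

    # A 1-kHz component under the four simulated insertion-depth conditions
    for mm in (0, 2, 4, 6):
        print(f"{mm} mm -> {shift_freq(1000.0, mm):.0f} Hz")

Under this map, a 6-mm basal shift moves a 1-kHz component to roughly 2.5 kHz, which conveys how disruptive the largest condition is.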

RESULTS: For moderate amounts of spectral shifting, group data suggested a general "halfway" recalibration to spectral shifting, but individual data suggested a notably different conclusion: half of the listeners were able to recalibrate fully, while the other half were unable to categorize shifted speech with any reliability. There were no participants who demonstrated a pattern intermediate to these two extremes. Intelligibility of words decreased with greater amounts of spectral shifting, also showing loose clusters of better- and poorer-performing listeners. Phonetic analysis of word errors revealed that certain cues were more susceptible to being compromised due to a frequency shift (place and manner of articulation), while voicing was robust to spectral shifting.

CONCLUSIONS: Shifting the frequency spectrum of speech has systematic effects that are in line with known properties of speech acoustics, but the ensuing difficulties cannot be predicted based on tonotopic mismatch alone. Difficulties are subject to substantial individual differences in the capacity to adjust acoustic-phonetic mapping. These results help to explain why speech recognition in CI listeners cannot be fully predicted by peripheral factors like electrode placement and spectral resolution; even among listeners with functionally equivalent auditory input, there is an additional factor of simply being able or unable to flexibly adjust acoustic-phonetic mapping. This individual variability could motivate precise treatment approaches guided by an individual's relative reliance on wideband frequency representation (even if it is mismatched) or limited frequency coverage whose tonotopy is preserved.}, } @article {pmid33792205, year = {2021}, author = {Chen, F and Zhang, H and Ding, H and Wang, S and Peng, G and Zhang, Y}, title = {Neural coding of formant-exaggerated speech and nonspeech in children with and without autism spectrum disorders.}, journal = {Autism research : official journal of the International Society for Autism Research}, volume = {14}, number = {7}, pages = {1357-1374}, doi = {10.1002/aur.2509}, pmid = {33792205}, issn = {1939-3806}, mesh = {*Autism Spectrum Disorder/complications ; Child ; Child, Preschool ; Evoked Potentials ; Humans ; Language Development ; Phonetics ; Speech ; *Speech Perception ; }, abstract = {The presence of vowel exaggeration in infant-directed speech (IDS) may adapt to the age-appropriate demands in speech and language acquisition. Previous studies have provided behavioral evidence of atypical auditory processing towards IDS in children with autism spectrum disorders (ASD), while the underlying neurophysiological mechanisms remain unknown. This event-related potential (ERP) study investigated the neural coding of formant-exaggerated speech and nonspeech in 24 4- to 11-year-old children with ASD and 24 typically-developing (TD) peers. The EEG data were recorded using an alternating block design, in which each stimulus type (exaggerated/non-exaggerated sound) was presented with equal probability. ERP waveform analysis revealed an enhanced P1 for vowel formant exaggeration in the TD group but not in the ASD group. This speech-specific atypical processing in ASD was not found for the nonspeech stimuli which showed similar P1 enhancement in both ASD and TD groups. Moreover, the time-frequency analysis indicated that children with ASD showed differences in neural synchronization in the delta-theta bands for processing acoustic formant changes embedded in nonspeech. Collectively, the results add substantiating neurophysiological evidence (i.e., a lack of neural enhancement effect of vowel exaggeration) for atypical auditory processing of IDS in children with ASD, which may exert a negative effect on phonetic encoding and language learning. LAY SUMMARY: Atypical responses to motherese might act as a potential early marker of risk for children with ASD. 
This study investigated the neural responses to such socially relevant stimuli in the ASD brain, and the results suggested a lack of neural enhancement in response to motherese, even in individuals without intellectual disability.}, } @article {pmid33786072, year = {2021}, author = {Carmona-Duarte, C and Ferrer, MA and Plamondon, R and Gómez-Rodellar, A and Gómez-Vilda, P}, title = {Sigma-Lognormal Modeling of Speech.}, journal = {Cognitive computation}, volume = {13}, number = {2}, pages = {488-503}, pmid = {33786072}, issn = {1866-9956}, abstract = {Human movement studies and analyses have been fundamental in many scientific domains, ranging from neuroscience to education, pattern recognition to robotics, health care to sports, and beyond. Previous speech motor models have been proposed to understand how speech movement is produced and how the resulting speech varies when some parameters are changed. However, the inverse approach, in which the muscular response parameters and the subject's age are derived from real continuous speech, is not possible with such models. Instead, in the handwriting field, the kinematic theory of rapid human movements and its associated Sigma-lognormal model have been applied successfully to obtain the muscular response parameters. This work presents a speech kinematics-based model that can be used to study, analyze, and reconstruct complex speech kinematics in a simplified manner. A method based on the kinematic theory of rapid human movements and its associated Sigma-lognormal model is applied to describe and to parameterize the asymptotic impulse response of the neuromuscular networks involved in speech as a response to a neuromotor command. The method used to carry out transformations from formants to a movement observation is also presented. Experiments carried out with the (English) VTR-TIMIT database and the (German) Saarbrucken Voice Database, including people of different ages, with and without laryngeal pathologies, corroborate the link between the extracted parameters and aging, on the one hand, and the proportion between the first and second formants required in applying the kinematic theory of rapid human movements, on the other. The results should drive innovative developments in the modeling and understanding of speech kinematics.}, } @article {pmid33775469, year = {2023}, author = {Oren, L and Rollins, M and Gutmark, E and Howell, R}, title = {How Face Masks Affect Acoustic and Auditory Perceptual Characteristics of the Singing Voice.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {4}, pages = {515-521}, doi = {10.1016/j.jvoice.2021.02.028}, pmid = {33775469}, issn = {1873-4588}, mesh = {Humans ; *Singing ; Voice Quality ; Masks ; *COVID-19 ; Acoustics ; }, abstract = {Wearing a face mask has been accepted as one of the most effective ways for slowing the spread of COVID-19. Yet information regarding the degree to which masks affect acoustics and perception associated with voice performers is scarce. This study examines these effects with common face masks, namely a neck gaiter, disposable surgical mask, and N95 mask, as well as a novel material that could be used as a mask (acoustic foam). A recorded excerpt from the "Star-Spangled Banner" was played through a miniature speaker placed inside the mouth of a masked manikin. Experienced listeners were asked to rate perceptual qualities of these singing stimuli by blindly comparing them with the same recording captured without a mask.
Acoustic analysis showed that face masks affected the sound by enhancing or suppressing different frequency bands compared to no mask. Acoustic energy around the singer's formant was reduced when using surgical and N95 masks, which matches observations that these masks are more detrimental to the perception of the singing voice compared with the neck gaiter or acoustic foam. This suggests that singers can benefit from masks designed for minimal impact on auditory perception of the singing voice while maintaining reasonable filtering efficiency.}, } @article {pmid33773895, year = {2023}, author = {Havel, M and Sundberg, J and Traser, L and Burdumy, M and Echternach, M}, title = {Effects of Nasalization on Vocal Tract Response Curve.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {3}, pages = {339-347}, doi = {10.1016/j.jvoice.2021.02.013}, pmid = {33773895}, issn = {1873-4588}, mesh = {Humans ; *Nose/physiology ; *Paranasal Sinuses/physiology ; Vibration ; Magnetic Resonance Imaging ; Models, Biological ; Speech Acoustics ; }, abstract = {BACKGROUND: Earlier studies have shown that nasalization affects the radiated spectrum by modifying the vocal tract transfer function in a complex manner.

METHODS: Here we study this phenomenon by measuring the sine-sweep responses of 3-D models of the vowels /u, a, æ, i/, derived from volumetric MR imaging and coupled by means of tubes of different lengths and diameters to a 3-D model of a nasal tract.
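A methodological aside: a response curve can be estimated from a sine-sweep measurement by deconvolution, i.e., dividing the spectrum of the recorded response by that of the excitation. A minimal sketch under that assumption (the arrays sweep and recording are placeholders, not the study's data):

    import numpy as np

    def transfer_function(x, y, fs):
        """Estimate H(f) = Y(f)/X(f) from excitation x and recorded response y."""
        n = int(2 ** np.ceil(np.log2(len(x) + len(y))))    # zero-pad for linear deconvolution
        X, Y = np.fft.rfft(x, n), np.fft.rfft(y, n)
        eps = 1e-12 * np.abs(X).max()                      # regularize the spectral division
        H = Y * np.conj(X) / (np.abs(X) ** 2 + eps)
        freqs = np.fft.rfftfreq(n, 1.0 / fs)
        return freqs, 20.0 * np.log10(np.abs(H) + 1e-12)   # magnitude in dB

    # freqs, H_db = transfer_function(sweep, recording, fs=44100)
    # band = (freqs > 200) & (freqs < 2000)                # search region for a nasal dip
    # dip_hz = freqs[band][np.argmin(H_db[band])]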

RESULTS: The coupling introduced a dip into the vocal tract transfer function. The dip frequency was close to the main resonance of the nasal tract, a result in agreement with the in vivo sweep-tone measurements of Fujimura & Lindqvist [Fujimura & Lindqvist, 1972]. With increasing size of the coupling tube, the depth of the dip increased, and the first formant peak either changed in frequency or was split by the dip. Only marginal effects of the paranasal sinuses were observed. For certain coupling tube sizes, the spectral balance was changed, boosting the formant peaks in the 2-4 kHz range.
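Qualitatively, the reported dip behaves like an anti-resonance (a transmission zero) added to an all-pole vocal tract response. The toy pole-zero model below, which is our simplification and not the authors' 3-D acoustic measurement, reproduces the pattern: moving the zero pair toward the unit circle (standing in for a larger coupling area) deepens the dip near the nasal resonance:

    import numpy as np
    from scipy import signal

    fs = 16000
    formants = [(700, 60), (1200, 90), (2600, 120)]      # illustrative (freq Hz, bandwidth Hz)

    def pole_pair(freq, bw):
        r = np.exp(-np.pi * bw / fs)                     # pole radius from bandwidth
        w = 2 * np.pi * freq / fs
        return [r * np.exp(1j * w), r * np.exp(-1j * w)]

    poles = [p for f, b in formants for p in pole_pair(f, b)]

    def response_db(coupling, f_nasal=500.0):
        """Magnitude response with a zero pair of radius `coupling` at f_nasal."""
        wn = 2 * np.pi * f_nasal / fs
        zeros = [] if coupling == 0 else [coupling * np.exp(1j * wn),
                                          coupling * np.exp(-1j * wn)]
        w, h = signal.freqz_zpk(zeros, poles, 1.0, worN=2048, fs=fs)
        return w, 20.0 * np.log10(np.abs(h) + 1e-12)

    for c in (0.0, 0.9, 0.99):                           # larger c ~ wider velopharyngeal port
        freqs, mag = response_db(c)
        i = np.argmin(np.abs(freqs - 500.0))
        print(f"coupling={c}: level near 500 Hz = {mag[i]:.1f} dB (uncalibrated)")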

CONCLUSION: A velopharyngeal opening introduces a dip in the transfer function at the main resonance of the nasal tract. Its depth increases with the area of the opening and its frequency rises in some vowels.}, } @article {pmid33769836, year = {2021}, author = {Coughler, C and Hamel, EM and Cardy, JO and Archibald, LMD and Purcell, DW}, title = {Compensation to Altered Auditory Feedback in Children With Developmental Language Disorder and Typical Development.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6S}, pages = {2363-2376}, doi = {10.1044/2020_JSLHR-20-00374}, pmid = {33769836}, issn = {1558-9102}, mesh = {Child ; Feedback ; Humans ; *Language Development Disorders ; Speech ; *Speech Perception ; Speech Production Measurement ; }, abstract = {Purpose Developmental language disorder (DLD), an unexplained problem with using and understanding spoken language, has been hypothesized to have an underlying auditory processing component. Auditory feedback plays a key role in speech motor control. The current study examined whether auditory feedback is used to regulate speech production in a similar way by children with DLD and their typically developing (TD) peers. Method Participants aged 6-11 years completed tasks measuring hearing, language, first formant (F1) discrimination thresholds, partial vowel space, and responses to altered auditory feedback with F1 perturbation. Results Children with DLD tended to compensate more than TD children for the positive F1 manipulation and compensated less than TD children in the negative shift condition. Conclusion Our findings suggest that children with DLD make atypical use of auditory feedback.}, } @article {pmid33758251, year = {2021}, author = {Arenillas-Alcón, S and Costa-Faidella, J and Ribas-Prats, T and Gómez-Roig, MD and Escera, C}, title = {Neural encoding of voice pitch and formant structure at birth as revealed by frequency-following responses.}, journal = {Scientific reports}, volume = {11}, number = {1}, pages = {6660}, pmid = {33758251}, issn = {2045-2322}, mesh = {*Acoustic Stimulation ; Adult ; Age Factors ; Biomarkers ; Brain/*physiology ; Cognition ; Humans ; Infant, Newborn ; Pediatrics ; *Pitch Perception ; Sound Spectrography ; Speech Perception ; *Voice ; }, abstract = {Detailed neural encoding of voice pitch and formant structure plays a crucial role in speech perception, and is of key importance for an appropriate acquisition of the phonetic repertoire in infants from birth. However, the extent to which newborns are capable of extracting pitch and formant structure information from the temporal envelope and the temporal fine structure of speech sounds, respectively, remains unclear. Here, we recorded the frequency-following response (FFR) elicited by a novel two-vowel, rising-pitch-ending stimulus to simultaneously characterize voice pitch and formant structure encoding accuracy in a sample of neonates and adults. Data revealed that newborns tracked changes in voice pitch reliably and no differently than adults, but exhibited weaker signatures of formant structure encoding, particularly at higher formant frequency ranges. Thus, our results indicate a well-developed encoding of voice pitch at birth, while formant structure representation is maturing in a frequency-dependent manner.
Furthermore, we demonstrate the feasibility of assessing voice pitch and formant structure encoding within clinical evaluation times in a hospital setting, and suggest the possibility of using this novel stimulus as a tool for longitudinal developmental studies of the auditory system.}, } @article {pmid33741872, year = {2021}, author = {Emrani, E and Ghaemi, H and Labafchi, A and Samieirad, S}, title = {The Effect of Bimaxillary Orthognathic Surgery on Voice Characteristics in Skeletal Class 3 Deformity Patients: An Evaluation Using Acoustic Analysis.}, journal = {The Journal of craniofacial surgery}, volume = {32}, number = {6}, pages = {2129-2133}, doi = {10.1097/SCS.0000000000007479}, pmid = {33741872}, issn = {1536-3732}, mesh = {Acoustics ; Adult ; Cephalometry ; Female ; Follow-Up Studies ; Humans ; Male ; *Malocclusion, Angle Class III/surgery ; Mandible ; Maxilla ; *Orthognathic Surgery ; *Orthognathic Surgical Procedures ; Osteotomy, Le Fort ; Osteotomy, Sagittal Split Ramus ; }, abstract = {The aim of this study was to analyze the effects of bimaxillary orthognathic surgery on the acoustic voice characteristics of skeletal class 3 patients. All healthy nonsyndromic patients with Class 3 deformity who were eligible for bimaxillary orthognathic surgery were included in this before and after quasi-experimental study. This experiment's main intervention was mandibular setback surgery by bilateral sagittal split osteotomy plus maxillary advancement using LeFort 1 osteotomy. Age, sex, and intraoperative jaw movements were recorded. Acoustic analysis of voice samples (vowels /a/ and /i/) was performed with Praat software to obtain the outcome variables. The fundamental frequency (F0) and formant frequencies (F1, F2, and F3) of these vowels were extracted 1 week preoperatively (T0) and 1 and 6 months postoperatively (T1, T2) by a speech therapist. The significance level was set at 0.05 using SPSS 19. The study sample comprised 20 patients including 11 women (55%) and 9 men (45%) with a mean age of 31.95 ± 4.72 years. The average mandibular setback and maxillary advancement were 3.30 ± 0.86 and 2.85 ± 0.74 mm, respectively. The fundamental frequency (F0) and the first, second, and third formants (F1, F2, F3) of vowels /i/ and /a/ were significantly decreased across the postoperative time intervals (P < 0.05). The findings revealed that bimaxillary orthognathic surgery (maxillary advancement and mandibular setback with bilateral sagittal split osteotomy) might reduce the acoustic formant parameters of voice to the normal frequency ranges in patients with class 3 skeletal deformities. More clinical trials with greater sample sizes and long-term follow-ups are suggested in the future.}, } @article {pmid33740875, year = {2022}, author = {Geng, P and Gu, W}, title = {Acoustic and Perceptual Characteristics of Mandarin Speech in Gay and Heterosexual Male Speakers.}, journal = {Language and speech}, volume = {65}, number = {4}, pages = {1096-1109}, doi = {10.1177/00238309211000783}, pmid = {33740875}, issn = {1756-6053}, mesh = {Male ; Humans ; Speech Acoustics ; Speech ; Heterosexuality ; Acoustics ; *Speech Perception ; *Sexual and Gender Minorities ; }, abstract = {This study investigated acoustic and perceptual characteristics of Mandarin speech produced by gay and heterosexual male speakers. Acoustic analysis of monosyllabic words showed significant differences between the two groups in voice fundamental frequency (F0), F1 of the low vowel, and duration of aspiration/frication in consonants.
The acoustic patterns of F0, formants, and the center of gravity as well as spectral skewness of /s/ differed from those reported for Western languages such as American English, which could be interpreted from a sociopsychological point of view based on the different acceptability of gay identity in the two societies. The results of a perceptual experiment revealed significant but weak correlations between the acoustic parameters and the score of perceived gayness, which was significantly higher for gay speech than for heterosexual male speech. Although the observed F0 and F1 patterns in Mandarin gay speech were opposite to the stereotype of gayness, gay identity can still be identified to some extent from speech due to the existence of other acoustic cues such as a longer fricative duration, which is not a stereotype of gayness but has been consistently observed in Mandarin and Western languages.}, } @article {pmid33739930, year = {2021}, author = {König, A and Riviere, K and Linz, N and Lindsay, H and Elbaum, J and Fabre, R and Derreumaux, A and Robert, P}, title = {Measuring Stress in Health Professionals Over the Phone Using Automatic Speech Analysis During the COVID-19 Pandemic: Observational Pilot Study.}, journal = {Journal of medical Internet research}, volume = {23}, number = {4}, pages = {e24191}, pmid = {33739930}, issn = {1438-8871}, mesh = {Adult ; Anxiety/*diagnosis/etiology/psychology ; Burnout, Professional/*diagnosis/etiology/psychology ; COVID-19/epidemiology/*psychology ; Female ; Health Personnel/*psychology ; Humans ; Male ; Pandemics ; Pilot Projects ; SARS-CoV-2 ; Speech/*physiology ; *Speech Acoustics ; Surveys and Questionnaires ; Telephone ; }, abstract = {BACKGROUND: During the COVID-19 pandemic, health professionals have been directly confronted with the suffering of patients and their families. By making them main actors in the management of this health crisis, they have been exposed to various psychosocial risks (stress, trauma, fatigue, etc). Paradoxically, stress-related symptoms are often underreported in this vulnerable population but are potentially detectable through passive monitoring of changes in speech behavior.

OBJECTIVE: This study aims to investigate the use of rapid and remote measures of stress levels in health professionals working during the COVID-19 outbreak. This was done through the analysis of participants' speech behavior during a short phone call conversation and, in particular, via positive, negative, and neutral storytelling tasks.

METHODS: Speech samples from 89 health care professionals were collected over the phone during positive, negative, and neutral storytelling tasks; various voice features were extracted and compared with classical stress measures via standard questionnaires. Additionally, a regression analysis was performed.
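A sketch of how such a pipeline can be assembled with open-source tools; the feature set and model here are our choices for illustration, not necessarily those used in the study:

    import numpy as np
    import librosa
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import cross_val_score

    def voice_features(path):
        """Crude prosodic and spectral summary of one recording."""
        y, sr = librosa.load(path, sr=16000)
        f0 = librosa.yin(y, fmin=60, fmax=400, sr=sr)        # fundamental frequency track
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)   # spectral envelope summary
        return np.hstack([f0.mean(), f0.std(),
                          mfcc.mean(axis=1), mfcc.std(axis=1)])

    # X: one feature row per recording; y: questionnaire stress scores (placeholders)
    # X = np.vstack([voice_features(p) for p in recording_paths])
    # mae = -cross_val_score(Ridge(alpha=1.0), X, y,
    #                        scoring="neg_mean_absolute_error", cv=5).mean()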

RESULTS: Certain speech characteristics correlated with stress levels in both genders; mainly, spectral (ie, formant) features, such as the mel-frequency cepstral coefficient, and prosodic characteristics, such as the fundamental frequency, appeared to be sensitive to stress. Overall, for both male and female participants, using vocal features from the positive tasks for regression yielded the most accurate prediction results of stress scores (mean absolute error 5.31).
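For reference, the reported error metric is the mean absolute error, MAE = (1/n) * Σ|ŷᵢ - yᵢ|, the average absolute deviation between predicted and questionnaire-derived stress scores; an MAE of 5.31 thus means predictions were off by about 5.3 points on the stress scale, on average.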

CONCLUSIONS: Automatic speech analysis could help with early detection of subtle signs of stress in vulnerable populations over the phone. By combining the use of this technology with timely intervention strategies, it could contribute to the prevention of burnout and the development of comorbidities, such as depression or anxiety.}, } @article {pmid33733165, year = {2020}, author = {Strycharczuk, P and López-Ibáñez, M and Brown, G and Leemann, A}, title = {General Northern English. Exploring Regional Variation in the North of England With Machine Learning.}, journal = {Frontiers in artificial intelligence}, volume = {3}, number = {}, pages = {48}, pmid = {33733165}, issn = {2624-8212}, abstract = {In this paper, we present a novel computational approach to the analysis of accent variation. The case study is dialect leveling in the North of England, manifested as reduction of accent variation across the North and emergence of General Northern English (GNE), a pan-regional standard accent associated with middle-class speakers. We investigated this instance of dialect leveling using random forest classification, with audio data from a crowd-sourced corpus of 105 urban, mostly highly-educated speakers from five northern UK cities: Leeds, Liverpool, Manchester, Newcastle upon Tyne, and Sheffield. We trained random forest models to identify individual northern cities from a sample of other northern accents, based on first two formant measurements of full vowel systems. We tested the models using unseen data. We relied on undersampling, bagging (bootstrap aggregation) and leave-one-out cross-validation to address some challenges associated with the data set, such as unbalanced data and relatively small sample size. The accuracy of classification provides us with a measure of relative similarity between different pairs of cities, while calculating conditional feature importance allows us to identify which input features (which vowels and which formants) have the largest influence in the prediction. We do find a considerable degree of leveling, especially between Manchester, Leeds and Sheffield, although some differences persist. The features that contribute to these differences most systematically are typically not the ones discussed in previous dialect descriptions. We propose that the most systematic regional features are also not salient, and as such, they serve as sociolinguistic regional indicators. We supplement the random forest results with a more traditional variationist description of by-city vowel systems, and we use both sources of evidence to inform a description of the vowels of General Northern English.}, } @article {pmid33705674, year = {2021}, author = {Niziolek, CA and Parrell, B}, title = {Responses to Auditory Feedback Manipulations in Speech May Be Affected by Previous Exposure to Auditory Errors.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6S}, pages = {2169-2181}, pmid = {33705674}, issn = {1558-9102}, support = {R00 DC014520/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; }, mesh = {Feedback ; Feedback, Sensory ; Humans ; Reproducibility of Results ; *Speech ; *Speech Perception ; }, abstract = {Purpose Speakers use auditory feedback to guide their speech output, although individuals differ in the magnitude of their compensatory response to perceived errors in feedback. 
Little is known about the factors that contribute to the compensatory response or how fixed or flexible they are within an individual. Here, we test whether manipulating the perceived reliability of auditory feedback modulates speakers' compensation to auditory perturbations, as predicted by optimal models of sensorimotor control. Method Forty participants produced monosyllabic words in two separate sessions, which differed in the auditory feedback given during an initial exposure phase. In the veridical session exposure phase, feedback was normal. In the noisy session exposure phase, small, random formant perturbations were applied, reducing reliability of auditory feedback. In each session, a subsequent test phase introduced larger unpredictable formant perturbations. We assessed whether the magnitude of within-trial compensation for these larger perturbations differed across the two sessions. Results Compensatory responses to downward (though not upward) formant perturbations were larger in the veridical session than the noisy session. However, in post hoc testing, we found the magnitude of this effect is highly dependent on the choice of analysis procedures. Compensation magnitude was not predicted by other production measures, such as formant variability, and was not reliably correlated across sessions. Conclusions Our results, though mixed, provide tentative support that the feedback control system monitors the reliability of sensory feedback. These results must be interpreted cautiously given the potentially limited stability of auditory feedback compensation measures across analysis choices and across sessions. Supplemental Material https://doi.org/10.23641/asha.14167136.}, } @article {pmid33705004, year = {2021}, author = {Hernández-García, E and Velazquez, LM and González, R and Godino Llorente, JI and Plaza, G}, title = {Influence of Upper Airway Surgery on Voice and Speech Recognition.}, journal = {The Journal of craniofacial surgery}, volume = {32}, number = {2}, pages = {660-663}, doi = {10.1097/SCS.0000000000007175}, pmid = {33705004}, issn = {1536-3732}, mesh = {Humans ; Prospective Studies ; Speech ; Speech Acoustics ; *Speech Perception ; *Voice ; Voice Quality ; }, abstract = {PURPOSE: Upper airway surgery comprises a set of techniques that modify the anatomy of the vocal tract, including tonsillectomy and septoplasty. The objective of this work is to study the changes in acoustic parameters and the effects on the identification or verification of the speaker through the speech produced after the vocal tract surgeries, comparing them with a control group.

METHODS: A prospective study was performed between January 2019 and June 2019. The final study sample consisted of 84 patients who met the inclusion criteria: 31 underwent septoplasty, 26 underwent tonsillectomy, and 27 served as controls. Demographic data and GRBAS ratings were statistically evaluated. Tests were administered before surgery, 2 weeks after surgery, and 3 months later. Furthermore, to establish the equal error rate, patients' voices were recorded and subsequently subjected to acoustic analysis and automatic speaker identification through machine learning systems.
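The equal error rate used here is the operating point at which the false-acceptance and false-rejection rates of a verification system coincide; it can be located from trial scores with a standard ROC sweep. A self-contained sketch with synthetic scores (not study data):

    import numpy as np
    from sklearn.metrics import roc_curve

    def equal_error_rate(labels, scores):
        """labels: 1 = same speaker, 0 = different; scores: higher = more similar."""
        fpr, tpr, _ = roc_curve(labels, scores)
        fnr = 1.0 - tpr
        i = np.nanargmin(np.abs(fnr - fpr))       # point where FAR ~= FRR
        return (fpr[i] + fnr[i]) / 2.0

    rng = np.random.default_rng(0)
    genuine = rng.normal(1.0, 0.5, 200)           # same-speaker trial scores
    impostor = rng.normal(0.0, 0.5, 2000)         # different-speaker trial scores
    labels = np.r_[np.ones(200), np.zeros(2000)]
    print(equal_error_rate(labels, np.r_[genuine, impostor]))  # ~0.16 for these distributions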

RESULTS: A significant change in GRBAS was observed after surgery. Regarding acoustic parameters, a greater change in the fundamental frequency was observed 2 weeks after surgery in the tonsillectomy group. At 3 months, formants (F1-F3) and antiformants (AntiF1-AntiF3) had changed in the septoplasty group but not in the tonsillectomy or control groups. When studying the impact of voice changes on speaker verification through speech, a greater recognition error was observed in the tonsillectomy group at 2 weeks, consistent with the results obtained for the other parameters studied.

CONCLUSIONS: Results suggest that upper airway surgery modifies the vocal tract, affecting GRBAS ratings and acoustic parameters, including formants and antiformants, and thereby affects verification of the speaker through speech.}, } @article {pmid33679344, year = {2021}, author = {Riedinger, M and Nagels, A and Werth, A and Scharinger, M}, title = {Asymmetries in Accessing Vowel Representations Are Driven by Phonological and Acoustic Properties: Neural and Behavioral Evidence From Natural German Minimal Pairs.}, journal = {Frontiers in human neuroscience}, volume = {15}, number = {}, pages = {612345}, pmid = {33679344}, issn = {1662-5161}, abstract = {In vowel discrimination, commonly found discrimination patterns are directional asymmetries where discrimination is faster (or easier) if differing vowels are presented in a certain sequence compared to the reversed sequence. Different models of speech sound processing try to account for these asymmetries based on either phonetic or phonological properties. In this study, we tested and compared two of those often-discussed models, namely the Featurally Underspecified Lexicon (FUL) model (Lahiri and Reetz, 2002) and the Natural Referent Vowel (NRV) framework (Polka and Bohn, 2011). While most studies presented isolated vowels, we investigated a large stimulus set of German vowels in a more naturalistic setting within minimal pairs. We conducted a mismatch negativity (MMN) study in a passive and a reaction time study in an active oddball paradigm. In both data sets, we found directional asymmetries that can be explained by either phonological or phonetic theories. While behaviorally, the vowel discrimination was based on phonological properties, both tested models failed to comprehensively explain the observed neural patterns. Therefore, we additionally examined in depth, via multiple regression analyses, the influence of a variety of articulatory, acoustical, and lexical factors (e.g., formant structure, intensity, duration, and frequency of occurrence), as well as factors beyond the well-known ones (perceived loudness of vowels, degree of openness). The analyses revealed that the perceptual factor of perceived loudness has a greater impact than the literature has assumed and should be taken more strongly into consideration when analyzing preattentive natural vowel processing.}, } @article {pmid33675539, year = {2021}, author = {Kim, KS and Max, L}, title = {Speech auditory-motor adaptation to formant-shifted feedback lacks an explicit component: Reduced adaptation in adults who stutter reflects limitations in implicit sensorimotor learning.}, journal = {The European journal of neuroscience}, volume = {53}, number = {9}, pages = {3093-3108}, pmid = {33675539}, issn = {1460-9568}, support = {R01 DC017444/DC/NIDCD NIH HHS/United States ; R01 DC014510/DC/NIDCD NIH HHS/United States ; MOP-137001//CIHR/Canada ; }, mesh = {Adaptation, Physiological ; Adult ; Child ; Feedback ; Feedback, Sensory ; Humans ; Learning ; *Speech ; *Stuttering ; }, abstract = {The neural mechanisms underlying stuttering remain poorly understood. A large body of work has focused on sensorimotor integration difficulties in individuals who stutter, including recently the capacity for sensorimotor learning. Typically, sensorimotor learning is assessed with adaptation paradigms in which one or more sensory feedback modalities are experimentally perturbed in real time.
Our own previous work on speech with perturbed auditory feedback revealed substantial auditory-motor learning limitations in both children and adults who stutter (AWS). It remains unknown, however, which subprocesses of sensorimotor learning are impaired. Indeed, new insights from research on upper limb motor control indicate that sensorimotor learning involves at least two distinct components: (a) an explicit component that includes intentional strategy use and presumably is driven by target error and (b) an implicit component that updates an internal model without awareness of the learner and presumably is driven by sensory prediction error. Here, we attempted to dissociate these components for speech auditory-motor learning in AWS versus adults who do not stutter (AWNS). Our formant-shift auditory-motor adaptation results replicated previous findings that such sensorimotor learning is limited in AWS. Novel findings are that neither control nor stuttering participants reported any awareness of changing their productions in response to the auditory perturbation and that neither group showed systematic drift in auditory target judgments made throughout the adaptation task. These results indicate that speech auditory-motor adaptation to formant-shifted feedback relies exclusively on implicit learning processes. Thus, limited adaptation in AWS reflects poor implicit sensorimotor learning.}, } @article {pmid33658966, year = {2021}, author = {Stefanich, S and Cabrelli, J}, title = {The Effects of L1 English Constraints on the Acquisition of the L2 Spanish Alveopalatal Nasal.}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {640354}, pmid = {33658966}, issn = {1664-1078}, abstract = {This study examines whether L1 English/L2 Spanish learners at different proficiency levels acquire a novel L2 phoneme, the Spanish palatal nasal /ɲ/. While alveolar /n/ is part of the Spanish and English inventories, /ɲ/, which consists of a tautosyllabic palatal nasal+glide element, is not. This crosslinguistic disparity presents potential difficulty for L1 English speakers due to L1 segmental and phonotactic constraints; the closest English approximation is the heterosyllabic sequence /nj/ (e.g., "canyon" /kænjən/ [ˈkʰæn.jən], cf. Spanish cañón "canyon" /kaɲon/ [ka.ˈɲon]). With these crosslinguistic differences in mind, we ask: (1a) Do L1 English learners of L2 Spanish produce acoustically distinct Spanish /n/ and /ɲ/ and (1b) Does the distinction of /n/ and /ɲ/ vary by proficiency? In the case that learners distinguish /n/ and /ɲ/, the second question investigates the acoustic quality of /ɲ/ to determine (2a) if learners' L2 representation patterns with that of an L1 Spanish representation or if learners rely on an L1 representation (here, English /nj/) and (2b) if the acoustic quality of L2 Spanish /ɲ/ varies as a function of proficiency. Beginner (n = 9) and advanced (n = 8) L1 English/L2 Spanish speakers and a comparison group of 10 L1 Spanish/L2 English speakers completed delayed repetition tasks in which disyllabic nonce words were produced in a carrier phrase. English critical items contained an intervocalic heterosyllabic /nj/ sequence (e.g., [ˈpʰan.jə]); Spanish critical items consisted of items with either intervocalic onset /ɲ/ (e.g., [ˈxa.ɲa]) or /n/ ([ˈxa.na]).
We measured duration and formant contours of the following vocalic portion as acoustic indices of the /n/~/ɲ/ and /ɲ/ ~/nj/ distinctions. Results show that, while L2 Spanish learners produce an acoustically distinct /n/ ~ /ɲ/ contrast even at a low level of proficiency, the beginners produce an intermediate /ɲ/ that falls acoustically between their English /nj/ and the L1 Spanish /ɲ/ while the advanced learners' Spanish /ɲ/ and English /nj/ appear to be in the process of equivalence classification. We discuss these outcomes as they relate to the robustness of L1 phonological constraints in late L2 acquisition coupled with the role of perceptual cues, functional load, and questions of intelligibility.}, } @article {pmid33657098, year = {2021}, author = {Tabas, A and von Kriegstein, K}, title = {Neural modelling of the encoding of fast frequency modulation.}, journal = {PLoS computational biology}, volume = {17}, number = {3}, pages = {e1008787}, pmid = {33657098}, issn = {1553-7358}, mesh = {Adult ; Auditory Cortex/*physiology ; Auditory Pathways/*physiology ; Computational Biology ; Female ; Humans ; Male ; *Models, Neurological ; Speech Perception/*physiology ; Young Adult ; }, abstract = {Frequency modulation (FM) is a basic constituent of vocalisation in many animals as well as in humans. In human speech, short rising and falling FM-sweeps of around 50 ms duration, called formant transitions, characterise individual speech sounds. There are two representations of FM in the ascending auditory pathway: a spectral representation, holding the instantaneous frequency of the stimuli; and a sweep representation, consisting of neurons that respond selectively to FM direction. To-date computational models use feedforward mechanisms to explain FM encoding. However, from neuroanatomy we know that there are massive feedback projections in the auditory pathway. Here, we found that a classical FM-sweep perceptual effect, the sweep pitch shift, cannot be explained by standard feedforward processing models. We hypothesised that the sweep pitch shift is caused by a predictive feedback mechanism. To test this hypothesis, we developed a novel model of FM encoding incorporating a predictive interaction between the sweep and the spectral representation. The model was designed to encode sweeps of the duration, modulation rate, and modulation shape of formant transitions. It fully accounted for experimental data that we acquired in a perceptual experiment with human participants as well as previously published experimental results. We also designed a new class of stimuli for a second perceptual experiment to further validate the model. Combined, our results indicate that predictive interaction between the frequency encoding and direction encoding neural representations plays an important role in the neural processing of FM. 
In the brain, this mechanism is likely to occur at early stages of the processing hierarchy.}, } @article {pmid33656916, year = {2021}, author = {Levy, ES and Chang, YM and Hwang, K and McAuliffe, MJ}, title = {Perceptual and Acoustic Effects of Dual-Focus Speech Treatment in Children With Dysarthria.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6S}, pages = {2301-2316}, doi = {10.1044/2020_JSLHR-20-00301}, pmid = {33656916}, issn = {1558-9102}, mesh = {Acoustics ; Adult ; *Dysarthria/etiology/therapy ; Humans ; Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {Purpose Children with dysarthria secondary to cerebral palsy may experience reduced speech intelligibility and diminished communicative participation. However, minimal research has been conducted examining the outcomes of behavioral speech treatments in this population. This study examined the effect of Speech Intelligibility Treatment (SIT), a dual-focus speech treatment targeting increased articulatory excursion and vocal intensity, on intelligibility of narrative speech, speech acoustics, and communicative participation in children with dysarthria. Method American English-speaking children with dysarthria (n = 17) received SIT in a 3-week summer camplike setting at Columbia University. SIT follows motor-learning principles to train the child-friendly, dual-focus strategy, "Speak with your big mouth and strong voice." Children produced a story narrative at baseline, immediate posttreatment (POST), and at 6-week follow-up (FUP). Outcomes were examined via blinded listener ratings of ease of understanding (n = 108 adult listeners), acoustic analyses, and questionnaires focused on communicative participation. Results SIT resulted in significant increases in ease of understanding at POST, that were maintained at FUP. There were no significant changes to vocal intensity, speech rate, or vowel spectral characteristics, with the exception of an increase in second formant difference between vowels following SIT. Significantly enhanced communicative participation was evident at POST and FUP. Considerable variability in response to SIT was observed between children. Conclusions Dual-focus treatment shows promise for improving intelligibility and communicative participation in children with dysarthria, although responses to treatment vary considerably across children. Possible mechanisms underlying the intelligibility gains, enhanced communicative participation, and variability in treatment effects are discussed.}, } @article {pmid33646815, year = {2021}, author = {Howson, PJ and Redford, MA}, title = {The Acquisition of Articulatory Timing for Liquids: Evidence From Child and Adult Speech.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {3}, pages = {734-753}, pmid = {33646815}, issn = {1558-9102}, support = {R01 HD087452/HD/NICHD NIH HHS/United States ; }, mesh = {Adolescent ; Adult ; Aged, 80 and over ; Child ; Child, Preschool ; Family ; Humans ; *Language ; Phonetics ; *Speech ; Speech Acoustics ; Speech Production Measurement ; Tongue ; Young Adult ; }, abstract = {Purpose Liquids are among the last sounds to be acquired by English-speaking children. The current study considers their acquisition from an articulatory timing perspective by investigating anticipatory posturing for /l/ versus /ɹ/ in child and adult speech. 
Method In Experiment 1, twelve 5-year-old, twelve 8-year-old, and 11 college-aged speakers produced carrier phrases with penultimate stress on monosyllabic words that had /l/, /ɹ/, or /d/ (control) as singleton onsets and /æ/ or /u/ as the vowel. Short-domain anticipatory effects were acoustically investigated based on schwa formant values extracted from the preceding determiner (= the) and dynamic formant values across the /ə#LV/ sequence. In Experiment 2, long-domain effects were perceptually indexed using a previously validated forward-gated audiovisual speech prediction task. Results Experiment 1 results indicated that all speakers distinguished /l/ from /ɹ/ along F3. Adults distinguished /l/ from /ɹ/ with a lower F2. Older children produced subtler versions of the adult pattern; their anticipatory posturing was also more influenced by the following vowel. Younger children did not distinguish /l/ from /ɹ/ along F2, but both liquids were distinguished from /d/ in the domains investigated. Experiment 2 results indicated that /ɹ/ was identified earlier than /l/ in gated adult speech; both liquids were identified equally early in 5-year-olds' speech. Conclusions The results are interpreted to suggest a pattern of early tongue-body retraction for liquids in /ə#LV/ sequences in children's speech. More generally, it is suggested that children must learn to inhibit the influence of vowels on liquid articulation to achieve an adultlike contrast between /l/ and /ɹ/ in running speech.}, } @article {pmid33639824, year = {2021}, author = {Raharjo, I and Kothare, H and Nagarajan, SS and Houde, JF}, title = {Speech compensation responses and sensorimotor adaptation to formant feedback perturbations.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {2}, pages = {1147}, pmid = {33639824}, issn = {1520-8524}, support = {R01 DC017696/DC/NIDCD NIH HHS/United States ; R01 DC017690/DC/NIDCD NIH HHS/United States ; R01 DC013979/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 NS100440/NS/NINDS NIH HHS/United States ; R01 DC010145/DC/NIDCD NIH HHS/United States ; }, mesh = {Feedback ; Feedback, Sensory ; Phonetics ; *Speech ; *Speech Perception ; }, abstract = {Control of speech formants is important for the production of distinguishable speech sounds and is achieved with both feedback and learned feedforward control. However, it is unclear whether the learning of feedforward control involves the mechanisms of feedback control. Speakers have been shown to compensate for unpredictable transient mid-utterance perturbations of pitch and loudness feedback, demonstrating online feedback control of these speech features. To determine whether similar feedback control mechanisms exist in the production of formants, responses to unpredictable vowel formant feedback perturbations were examined. Results showed similar within-trial compensatory responses to formant perturbations that were presented at utterance onset and mid-utterance. The relationship between online feedback compensation to unpredictable formant perturbations and sensorimotor adaptation to consistent formant perturbations was further examined. Within-trial online compensation responses were not correlated with across-trial sensorimotor adaptation. 
A detailed analysis of within-trial time course dynamics across trials during sensorimotor adaptation revealed that across-trial sensorimotor adaptation responses did not result from an incorporation of within-trial compensation response. These findings suggest that online feedback compensation and sensorimotor adaptation are governed by distinct neural mechanisms. These findings have important implications for models of speech motor control in terms of how feedback and feedforward control mechanisms are implemented.}, } @article {pmid33639809, year = {2021}, author = {Carignan, C}, title = {A practical method of estimating the time-varying degree of vowel nasalization from acoustic features.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {2}, pages = {911}, doi = {10.1121/10.0002925}, pmid = {33639809}, issn = {1520-8524}, mesh = {Acoustics ; Language ; *Phonetics ; *Speech Acoustics ; }, abstract = {This paper presents a simple and easy-to-use method of creating a time-varying signal of the degree of nasalization in vowels, generated from acoustic features measured in oral and nasalized vowel contexts. The method is presented for separate models constructed using two sets of acoustic features: (1) an uninformed set of 13 Mel-frequency cepstral coefficients (MFCCs) and (2) a combination of the 13 MFCCs and a phonetically informed set of 20 acoustic features of vowel nasality derived from previous research. Both models are compared against two traditional approaches to estimating vowel nasalization from acoustics: A1-P0 and A1-P1, as well as their formant-compensated counterparts. Data include productions from six speakers of different language backgrounds, producing 11 different qualities within the vowel quadrilateral. The results generated from each of the methods are compared against nasometric measurements, representing an objective "ground truth" of the degree of nasalization. The results suggest that the proposed method is more robust than conventional acoustic approaches, generating signals which correlate strongly with nasometric measures across all vowel qualities and all speakers and accurately approximate the time-varying change in the degree of nasalization. Finally, an experimental example is provided to help researchers implement the method in their own study designs.}, } @article {pmid33630668, year = {2021}, author = {Chung, H and Weismer, G}, title = {Formant Trajectory Patterns of American English /l/ Produced by Adults and Children.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {3}, pages = {809-822}, doi = {10.1044/2020_JSLHR-20-00345}, pmid = {33630668}, issn = {1558-9102}, mesh = {Adult ; Child ; Child Language ; Child, Preschool ; Female ; Humans ; Language ; Language Development ; *Phonetics ; *Speech ; Speech Acoustics ; United States ; }, abstract = {Purpose Most acoustic and articulatory studies on /l/ have focused on either duration, formant frequencies, or tongue shape during the constriction interval. Only a limited set of data exists for the transition characteristics of /l/ to and from surrounding vowels. The aim of this study was to examine second formant (F2) transition characteristics of /l/ produced by young children and adults. This was to better understand articulatory behaviors in the production of /l/ and potential clinical applications of these data to typical and delayed /l/ development. 
Method Participants included 17 children with typically developing speech between the ages of 2 and 5 years, and 10 female adult speakers of Southern American English. Each subject produced single words containing pre- and postvocalic /l/ in two vowel contexts (/i, ɪ/ and /ɔ, ɑ/). F2 transitions, out of and into /l/ constriction intervals from the adjacent vowels, were analyzed for perceptually acceptable /l/ productions. The F2 transition extent, duration, and rate, as well as F2 loci data, were compared across age groups by vowel context for both pre- and postvocalic /l/. Results F2 transitions of adults' /l/ showed a great similarity across and within speakers. Those of young children showed greater variability, but became increasingly similar to those of adults with age. The F2 loci data seemed consistent with greater coarticulation among children than adults. This conclusion, however, must be regarded as preliminary due to the possible influence of different vocal tract size across ages and variability in the data. Conclusions The results suggest that adult patterns can serve as a reliable reference to which children's /l/ productions can be evaluated. The articulatory configurations associated with the /l/ constriction interval and the vocal tract movements into and out of that interval may provide insight into the underlying difficulties related to misarticulated /l/.}, } @article {pmid33615923, year = {2021}, author = {Ng, ML and Woo, HK}, title = {Effect of total laryngectomy on vowel production: An acoustic study of vowels produced by alaryngeal speakers of Cantonese.}, journal = {International journal of speech-language pathology}, volume = {23}, number = {6}, pages = {652-661}, doi = {10.1080/17549507.2021.1876166}, pmid = {33615923}, issn = {1754-9515}, mesh = {Acoustics ; Humans ; Laryngectomy ; *Larynx, Artificial ; Phonetics ; Speech ; Speech Acoustics ; *Speech, Alaryngeal ; }, abstract = {Purpose: To investigate the effect of total laryngectomy on vowel production, the present study examined the change in vowel articulation associated with different types of alaryngeal speech in comparison with laryngeal speech using novel derived formant metrics.Method: Six metrics derived from the first two formants (F1 and F2) including the First and Second Formant Range Ratios (F1RR and F2RR), triangular and pentagonal Vowel Space Area (tVSA and pVSA), Formant Centralisation Ratio (FCR) and Average Vowel Spacing (AVS) were measured from vowels (/i, y, ɛ, a, ɔ, œ, u/) produced by oesophageal (ES), tracheoesophageal (TE), electrolaryngeal (EL), pneumatic artificial laryngeal (PA) speakers, as well as laryngeal speakers.Result: Data revealed a general reduction in articulatory range and a tendency of vowel centralisation in Cantonese alaryngeal speakers. Significant articulatory difference was found for PA and EL compared with ES, TE, and laryngeal speakers.Conclusion: The discrepant results among alaryngeal speakers may be related to the difference in new sound source (external vs internal). 
Sensitivity and correlation analyses confirmed that the use of the matrix of derived formant metrics provided a more comprehensive profile of the articulatory pattern in the alaryngeal population.}, } @article {pmid33608184, year = {2023}, author = {Maryn, Y and Wuyts, FL and Zarowski, A}, title = {Are Acoustic Markers of Voice and Speech Signals Affected by Nose-and-Mouth-Covering Respiratory Protective Masks?.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {3}, pages = {468.e1-468.e12}, pmid = {33608184}, issn = {1873-4588}, mesh = {Humans ; *Speech ; Masks ; Speech Acoustics ; *COVID-19 ; Acoustics ; Speech Production Measurement ; }, abstract = {BACKGROUND: Worldwide use of nose-and-mouth-covering respiratory protective masks (RPMs) has become ubiquitous during the COVID-19 pandemic. Consequences of wearing RPMs, especially regarding perception and production of spoken communication, are gradually emerging. The present study explored how three prevalent RPMs affect various speech and voice sound properties.

METHODS: Pre-recorded sustained [a] vowels and read sentences from 47 subjects were played by a speech production model ('Voice Emitted by Spare Parts', or 'VESPA') in four conditions: without RPM (C1), with disposable surgical mask (C2), with FFP2 mask (C3), and with transparent plastic mask (C4). Differences between C1 and masked conditions were assessed with Dunnett's t test in 26 speech sound properties related to voice production (fundamental frequency, sound intensity level), voice quality (jitter percent, shimmer percent, harmonics-to-noise ratio, smoothed cepstral peak prominence, Acoustic Voice Quality Index), articulation and resonance (first and second formant frequencies, first and second formant bandwidths, spectral center of gravity, spectral standard deviation, spectral skewness, spectral kurtosis, spectral slope, and spectral energy in ten 1-kHz bands from 0 to 10 kHz).
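Several of the listed markers (spectral center of gravity, standard deviation, skewness, kurtosis, and the 1-kHz band energies) are moments and partial sums of the power spectrum, and the masked-versus-control design matches Dunnett's test. A sketch under those assumptions — y and fs are placeholders, the moment definitions follow one common convention that may differ in detail from Praat's, and scipy.stats.dunnett requires SciPy >= 1.11:

    import numpy as np
    from scipy import stats

    def spectral_moments(y, fs):
        """Center of gravity, SD, skewness, and kurtosis of the power spectrum."""
        power = np.abs(np.fft.rfft(y)) ** 2
        f = np.fft.rfftfreq(len(y), 1.0 / fs)
        p = power / power.sum()                   # treat the spectrum as a distribution
        cog = np.sum(f * p)
        sd = np.sqrt(np.sum((f - cog) ** 2 * p))
        skew = np.sum(((f - cog) / sd) ** 3 * p)
        kurt = np.sum(((f - cog) / sd) ** 4 * p)
        return cog, sd, skew, kurt

    def band_energies(y, fs, width=1000, top=10000):
        """Spectral energy in consecutive 1-kHz bands from 0 Hz to top."""
        power = np.abs(np.fft.rfft(y)) ** 2
        f = np.fft.rfftfreq(len(y), 1.0 / fs)
        return [power[(f >= lo) & (f < lo + width)].sum()
                for lo in range(0, top, width)]

    # Masked conditions compared against the unmasked control, one marker at a time:
    # res = stats.dunnett(cog_c2, cog_c3, cog_c4, control=cog_c1)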

RESULTS: C2, C3, and C4 significantly affected 10, 15, and 19 of the acoustic speech markers, respectively. Furthermore, absolute differences between unmasked and masked conditions were largest for C4 and smallest for C2.

CONCLUSIONS: All RPMs influenced speech sound properties to a greater or lesser extent. However, this influence was least for surgical RPMs and most for plastic RPMs. Surgical RPMs are therefore preferred when spoken communication is a priority alongside respiratory protection.}, } @article {pmid33600430, year = {2021}, author = {Cavalcanti, JC and Eriksson, A and Barbosa, PA}, title = {Acoustic analysis of vowel formant frequencies in genetically-related and non-genetically related speakers with implications for forensic speaker comparison.}, journal = {PloS one}, volume = {16}, number = {2}, pages = {e0246645}, pmid = {33600430}, issn = {1932-6203}, mesh = {Acoustics ; Adult ; Brazil ; Forensic Sciences/methods ; Humans ; Language ; Male ; Phonetics ; Psychoacoustics ; Speech/*physiology ; *Speech Acoustics ; Speech Perception/physiology ; Twins, Monozygotic ; Verbal Behavior/*physiology ; }, abstract = {The purpose of this study was to explore the speaker-discriminatory potential of vowel formant mean frequencies in comparisons of identical twin pairs and non-genetically related speakers. The influences of lexical stress and the vowels' acoustic distances on the discriminatory patterns of formant frequencies were also assessed. Acoustic extraction and analysis of the first four speech formants F1-F4 were carried out using spontaneous speech materials. The recordings comprise telephone conversations between identical twin pairs, recorded directly through high-quality microphones. The subjects were 20 male adult speakers of Brazilian Portuguese (BP), aged between 19 and 35. For the comparisons, stressed and unstressed oral vowels of BP were segmented and transcribed manually in the Praat software. F1-F4 formant estimates were automatically extracted from the middle points of each labeled vowel. Formant values were represented in both Hertz and Bark. Comparisons within identical twin pairs using the Bark scale were performed to verify whether the measured differences would be potentially significant when following a psychoacoustic criterion. The results revealed consistent patterns regarding the comparison of low-frequency and high-frequency formants in twin pairs and non-genetically related speakers, with high-frequency formants displaying greater speaker-discriminatory power than low-frequency formants. Among all formants, F4 seemed to display the highest discriminatory potential within identical twin pairs, followed by F3. As for non-genetically related speakers, both F3 and F4 displayed a similarly high discriminatory potential. Regarding vowel quality, the central vowel /a/ was found to be the most speaker-discriminatory segment, followed by front vowels. Moreover, stressed vowels displayed higher inter-speaker discrimination than unstressed vowels in both groups; however, the combination of stressed and unstressed vowels was found to be even more explanatory of the observed differences.
Although identical twins displayed greater phonetic similarity, they were not found to be phonetically identical.}, } @article {pmid33589372, year = {2023}, author = {Lau, HYC and Scherer, RC}, title = {Objective Measures of Two Musical Interpretations of an Excerpt From Berlioz's "La mort d'Ophélie".}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {2}, pages = {301.e9-301.e25}, doi = {10.1016/j.jvoice.2020.12.045}, pmid = {33589372}, issn = {1873-4588}, mesh = {Humans ; Voice Quality ; *Music ; Speech Acoustics ; Glottis/physiology ; *Voice ; }, abstract = {OBJECTIVE/HYPOTHESIS: This study aimed to determine objective production differences relative to two emotional interpretations in performing an excerpt from a classical art song. The null hypothesis was proposed.

METHODS: The first author recorded an excerpt from an art song. The excerpt was sung with two contrasting musical interpretations: an "empathetic legato" approach, and a "sarcastic" approach characterized by emphatic attacks. Microphone, airflow, and electroglottography signals were digitized. The vowels were analyzed in terms of intensity, long-term average spectra, fundamental frequency (fo), airflow vibrato rate and extent, vowel onset slope, intensity comparison of harmonic frequencies, and glottal measures based on electroglottograph waveforms. Four consonant tokens were analyzed relative to airflow, voice onset time, and production duration.

RESULTS & CONCLUSIONS: The emphatic performance had faster vowel onset, increased glottal adduction, increased intensity of harmonics in 2-3 kHz, increased intensity in the fourth and fifth formants, inferred subglottal pressure increase, increased airflow for /f/, and greater aspiration airflow for /p, t/. Vibrato extents for intensity, fo, and airflow were wider in the emphatic approach. Findings revealed larger EGGW25 and peak-to-peak amplitude values of the electroglottography waveform, suggesting greater vocal fold contact area and longer glottal closure for the emphatic approach. Long-term average spectrum analyses of the entire production displayed minor variation across all formant frequencies, suggesting an insignificant change in vocal tract shaping between the two approaches. This single-case objective study emphasizes the reality of physiological, aerodynamic, and acoustic production differences in the interpretive and pedagogical aspects of art song performance.}, } @article {pmid33577218, year = {2021}, author = {Easwar, V and Bridgwater, E and Purcell, D}, title = {The Influence of Vowel Identity, Vowel Production Variability, and Consonant Environment on Envelope Following Responses.}, journal = {Ear and hearing}, volume = {42}, number = {3}, pages = {662-672}, doi = {10.1097/AUD.0000000000000966}, pmid = {33577218}, issn = {1538-4667}, mesh = {Acoustic Stimulation ; Evoked Potentials, Auditory, Brain Stem ; Humans ; Language ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception ; Young Adult ; }, abstract = {OBJECTIVES: The vowel-evoked envelope following response (EFR) is a useful tool for studying brainstem processing of speech in natural consonant-vowel productions. Previous work, however, demonstrates that the amplitude of EFRs is highly variable across vowels. To clarify factors contributing to the variability observed, the objectives of the present study were to evaluate: (1) the influence of vowel identity and the consonant context surrounding each vowel on EFR amplitude and (2) the effect of variations in repeated productions of a vowel on EFR amplitude while controlling for the consonant context.

DESIGN: In Experiment 1, EFRs were recorded in response to seven English vowels (/ij/, /ɪ/, /ej/, /ε/, /æ/, /u/, and a seventh vowel whose IPA symbol is rendered only as an inline graphic in the source record) embedded in each of four consonant contexts (/hVd/, /sVt/, /zVf/, and a fourth /CVv/ context whose initial consonant is likewise rendered only as an inline graphic). In Experiment 2, EFRs were recorded in response to four different variants of one of the four possible vowels (/ij/, /ε/, /æ/, or the inline-graphic vowel), embedded in the same consonant-vowel-consonant environments used in Experiment 1. All vowels were edited to minimize formant transitions before embedding in a consonant context. Different talkers were used for the two experiments. Data from a total of 30 and 64 (16 listeners/vowel) young adults with normal hearing were included in Experiments 1 and 2, respectively. EFRs were recorded using a single-channel electrode montage between the vertex and nape of the neck while stimuli were presented monaurally.

RESULTS: In Experiment 1, vowel identity had a significant effect on EFR amplitude, with the vowel /æ/ eliciting the highest amplitude EFRs (170 nV, on average), and the vowel /ej/ eliciting the lowest amplitude EFRs (106 nV, on average). The consonant context surrounding each vowel stimulus had no statistically significant effect on EFR amplitude. Similarly, in Experiment 2, consonant context did not influence the amplitude of EFRs elicited by the vowel variants. Vowel identity significantly altered EFR amplitude, with /ε/ eliciting the highest amplitude EFRs (104 nV, on average). Significant, albeit small, differences (<21 nV, on average) in EFR amplitude were evident between some variants of /ε/ and /u/.
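The nanovolt amplitudes reported above are spectral magnitudes at the stimulus f0. A minimal sketch of the usual estimation route follows, on synthetic data and omitting the artifact rejection and noise-floor statistics a real EFR analysis would add.

```python
# Illustrative estimate of EFR amplitude as the spectral magnitude of the
# averaged response at the stimulus f0. Synthetic data; all parameters are
# assumptions for illustration, not the study's recording settings.
import numpy as np

fs, f0, n_epochs, dur = 8000, 100.0, 300, 0.5
t = np.arange(int(fs * dur)) / fs
rng = np.random.default_rng(0)

# Synthetic epochs: a small 100-Hz "response" buried in noise (arbitrary units).
epochs = 0.1 * np.sin(2 * np.pi * f0 * t) + rng.normal(0, 5, (n_epochs, t.size))

avg = epochs.mean(axis=0)                        # averaging suppresses noise
spec = np.abs(np.fft.rfft(avg)) / t.size * 2     # single-sided amplitude spectrum
freqs = np.fft.rfftfreq(t.size, 1 / fs)
efr_amp = spec[np.argmin(np.abs(freqs - f0))]    # amplitude at f0
print(f"EFR amplitude at {f0} Hz: {efr_amp:.3f} (arbitrary units)")
```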

CONCLUSION: Based on a comprehensive set of naturally produced vowel samples in carefully controlled consonant contexts, the present study provides additional evidence for the sensitivity of EFRs to vowel identity and variations in vowel production. The surrounding consonant context (after removal of formant transitions) has no measurable effect on EFRs, irrespective of vowel identity and variant. The sensitivity of EFRs to nuances in vowel acoustics emphasizes the need for adequate control and evaluation of stimuli proposed for clinical and research purposes.}, } @article {pmid33568701, year = {2021}, author = {Hodges-Simeon, CR and Grail, GPO and Albert, G and Groll, MD and Stepp, CE and Carré, JM and Arnocky, SA}, title = {Testosterone therapy masculinizes speech and gender presentation in transgender men.}, journal = {Scientific reports}, volume = {11}, number = {1}, pages = {3494}, pmid = {33568701}, issn = {2045-2322}, support = {T32 DC013017/DC/NIDCD NIH HHS/United States ; DC013017/NH/NIH HHS/United States ; }, mesh = {Adult ; Humans ; Male ; Speech/*drug effects/physiology ; Speech Acoustics ; Speech Perception/*drug effects/physiology ; Testosterone/*pharmacology ; Transgender Persons/psychology ; Transsexualism/*drug therapy ; Voice/drug effects ; Voice Quality/drug effects ; Young Adult ; }, abstract = {Voice is one of the most noticeably dimorphic traits in humans and plays a central role in gender presentation. Transgender males seeking to align internal identity and external gender expression frequently undergo testosterone (T) therapy to masculinize their voices and other traits. We aimed to determine the importance of changes in vocal masculinity for transgender men and to determine the effectiveness of T therapy at masculinizing three speech parameters: fundamental frequency (i.e., pitch) mean and variation (fo and fo-SD) and estimated vocal tract length (VTL) derived from formant frequencies. Thirty transgender men aged 20 to 40 rated their satisfaction with traits prior to and after T therapy and contributed speech samples and salivary T. Similar-aged cisgender men and women contributed speech samples for comparison. We show that transmen viewed voice change as critical to transition success compared to other masculine traits. However, T therapy may not be sufficient to fully masculinize speech: while fo and fo-SD were largely indistinguishable from those of cismen, VTL was intermediate between cismen and ciswomen. fo was correlated with salivary T, and VTL was associated with T therapy duration. This argues for additional approaches, such as behavior therapy and/or longer duration of hormone therapy, to improve speech transition.}, } @article {pmid33555417, year = {2021}, author = {Heimbauer, LA and Beran, MJ and Owren, MJ}, title = {A chimpanzee recognizes varied acoustical versions of sine-wave and noise-vocoded speech.}, journal = {Animal cognition}, volume = {24}, number = {4}, pages = {843-854}, pmid = {33555417}, issn = {1435-9456}, support = {IBN-9876754//National Science Foundation/ ; }, mesh = {Acoustic Stimulation/veterinary ; Animals ; Cues ; Noise ; Pan troglodytes ; *Speech ; *Speech Perception ; }, abstract = {Previous research demonstrated that a language-trained chimpanzee recognized familiar English words in sine-wave and noise-vocoded forms (Heimbauer et al. Curr Biol 21:1210-1214, 2011). However, those results did not provide information regarding processing strategies of the specific acoustic cues to which the chimpanzee may have attended.
The current experiments tested this chimpanzee and adult humans using sine-wave and noise-vocoded speech, manipulated by varying which sine-wave components were present and the number of noise bands, respectively. Similar to humans tested with the same stimuli, the chimpanzee was more successful at identifying sine-wave speech when both SW1 and SW2 were present, these being the components modeled on formants F1 and F2 in the natural speech signal. Results with noise-vocoded speech revealed that the chimpanzee and humans performed best with stimuli that included four or five noise bands, as compared to those with three or two. Overall, amplitude and frequency modulation over time were important for identification of sine-wave and noise-vocoded speech, with further evidence that a nonhuman primate is capable of using top-down processes for speech perception when the signal is altered and incomplete.}, } @article {pmid33524265, year = {2021}, author = {Yang, J and Xu, L}, title = {Vowel Production in Prelingually Deafened Mandarin-Speaking Children With Cochlear Implants.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {2}, pages = {664-682}, doi = {10.1044/2020_JSLHR-20-00469}, pmid = {33524265}, issn = {1558-9102}, mesh = {Acoustics ; Adult ; Child ; Child, Preschool ; *Cochlear Implantation ; *Cochlear Implants ; *Deafness/surgery ; Humans ; Phonetics ; Speech Acoustics ; *Speech Perception ; }, abstract = {Purpose The purpose of this study was to characterize the acoustic profile and to evaluate the intelligibility of vowel productions in prelingually deafened, Mandarin-speaking children with cochlear implants (CIs). Method Twenty-five children with CIs and 20 age-matched children with normal hearing (NH) were recorded producing a list of Mandarin disyllabic and trisyllabic words containing 20 Mandarin vowels [a, i, u, y, ɤ, ɿ, ʅ, ai, ei, ia, ie, ye, ua, uo, au, ou, iau, iou, uai, uei] located in the first consonant-vowel syllable. The children with CIs were all prelingually deafened and received unilateral implantation before 7 years of age with an average length of CI use of 4.54 years. In the acoustic analysis, the first two formants (F1 and F2) were extracted at seven equidistant time locations for the tested vowels. The durational and spectral features were compared between the CI and NH groups. In the vowel intelligibility task, the extracted vowel portions in both NH and CI children were presented to six Mandarin-speaking, NH adult listeners for identification. Results The acoustic analysis revealed that the children with CIs deviated from the NH controls in the acoustic features for both single vowels and compound vowels. The acoustic deviations were reflected in longer duration, more scattered vowel categories, smaller vowel space area, and distinct formant trajectories in the children with CIs in comparison to NH controls. The vowel intelligibility results showed that the recognition accuracy of the vowels produced by the children with CIs was significantly lower than that of the NH children. The confusion pattern of vowel recognition in the children with CIs generally followed that in the NH children.
Conclusion Our data suggested that the prelingually deafened children with CIs, with a relatively long duration of CI experience, still showed measurable acoustic deviations and lower intelligibility in vowel productions in comparison to the NH children.}, } @article {pmid33522087, year = {2021}, author = {Carl, M and Icht, M}, title = {Acoustic vowel analysis and speech intelligibility in young adult Hebrew speakers: Developmental dysarthria versus typical development.}, journal = {International journal of language & communication disorders}, volume = {56}, number = {2}, pages = {283-298}, doi = {10.1111/1460-6984.12598}, pmid = {33522087}, issn = {1460-6984}, mesh = {Acoustics ; Adolescent ; *Dysarthria/diagnosis ; Humans ; Language ; Phonetics ; Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; Young Adult ; }, abstract = {BACKGROUND: Developmental dysarthria is a motor speech impairment commonly characterized by varying levels of reduced speech intelligibility. The relationship between intelligibility deficits and acoustic vowel space among these individuals has long been noted in the literature, with evidence of vowel centralization (e.g., in English and Mandarin). However, the degree to which this centralization occurs and the intelligibility-acoustic relationship is maintained in different vowel systems has yet to be studied thoroughly. In comparison with American English, the Hebrew vowel system is significantly smaller, with a potentially smaller vowel space area, a factor that may impact upon the comparisons of the acoustic vowel space and its correlation with speech intelligibility. Data on vowel space and speech intelligibility are particularly limited for Hebrew speakers with motor speech disorders.
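For the formant-trajectory method in the Yang & Xu entry above (F1/F2 sampled at seven equidistant time points), a sketch using praat-parselmouth, a Python interface to Praat, follows; the file name, vowel interval, and formant ceiling are hypothetical.

```python
# Sketch of extracting F1/F2 at seven equidistant time points across a vowel,
# as in the Yang & Xu entry above, using praat-parselmouth. The file name,
# vowel interval, and analysis settings are hypothetical.
import numpy as np
import parselmouth

snd = parselmouth.Sound("vowel_token.wav")                   # hypothetical file
formant = snd.to_formant_burg(maximum_formant=5500.0)        # child/female-range ceiling
v_start, v_end = 0.10, 0.35                                  # hypothetical vowel interval (s)

times = np.linspace(v_start, v_end, 7)                       # seven equidistant points
trajectory = [(formant.get_value_at_time(1, t),              # F1 (Hz)
               formant.get_value_at_time(2, t))              # F2 (Hz)
              for t in times]
for t, (f1, f2) in zip(times, trajectory):
    print(f"{t:.3f}s  F1={f1:.0f} Hz  F2={f2:.0f} Hz")
```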

AIMS: To determine the nature and degree of vowel space centralization in Hebrew-speaking adolescents and young adults with dysarthria, in comparison with typically developing (TD) peers, and to correlate these findings with speech intelligibility scores.

METHODS & PROCEDURES: Adolescents and young adults with developmental dysarthria (secondary to cerebral palsy (CP) and other motor deficits, n = 17) and their TD peers (n = 17) were recorded producing Hebrew corner vowels within single words. For intelligibility assessments, naïve listeners transcribed those words produced by speakers with CP, and intelligibility scores were calculated.

OUTCOMES & RESULTS: Acoustic analysis of vowel formants (F1, F2) revealed a centralization of vowel space among speakers with CP for all acoustic metrics of vowel formants, and mainly for the formant centralization ratio (FCR), in comparison with TD peers. Intelligibility scores were correlated strongly with the FCR metric for speakers with CP.
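The FCR-intelligibility correlation reported above is straightforward to compute once per-speaker FCR values (see the FCR sketch earlier in this section) and transcription-based intelligibility scores are in hand. A sketch with fabricated illustrative numbers, not the study's data:

```python
# Minimal sketch of correlating a vowel-centralization metric with speech
# intelligibility scores, as reported in the entry above. The numbers are
# hypothetical, for illustration only.
from scipy import stats

fcr = [1.12, 1.25, 1.08, 1.31, 1.19, 1.22, 1.05, 1.28]       # hypothetical FCR values
intelligibility = [78, 55, 84, 47, 66, 60, 88, 52]           # hypothetical % words correct

r, p = stats.pearsonr(fcr, intelligibility)
print(f"Pearson r = {r:.2f}, p = {p:.3f}")
# A negative r is expected: higher FCR (more centralized vowels) should go
# with lower intelligibility.
```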

CONCLUSIONS & IMPLICATIONS: The main results, vowel space centralization for speakers with CP in comparison with TD peers, echo previous cross-linguistic results. The correlation of acoustic results with speech intelligibility carries clinical implications. Taken together, the results contribute to better characterization of the speech production deficit in Hebrew speakers with motor speech disorders. Furthermore, they may guide clinical decision-making and intervention planning to improve speech intelligibility. What this paper adds What is already known on the subject Speech production and intelligibility deficits among individuals with developmental dysarthria (e.g., secondary to CP) are well documented. These deficits have also been correlated with centralization of the acoustic vowel space, although primarily in English speakers. Little is known about the acoustic characteristics of vowels in Hebrew speakers with motor speech disorders, and whether correlations with speech intelligibility are maintained. What this paper adds to existing knowledge This study is the first to describe the acoustic characteristics of vowel space in Hebrew-speaking adolescents and young adults with developmental dysarthria. The results demonstrate a centralization of the acoustic vowel space in comparison with TD peers for all measures, as found in other languages. Correlations between acoustic measures and speech intelligibility scores were also documented. We discuss these results within the context of cross-linguistic comparisons. What are the potential or actual clinical implications of this work? The results confirm the use of objective acoustic measures in the assessment of individuals with motor speech disorders, providing such data for Hebrew-speaking adolescents and young adults. These measures can be used to determine the nature and severity of the speech deficit across languages, may guide intervention planning, as well as measure the effectiveness of intelligibility-based treatment programmes.}, } @article {pmid33514177, year = {2021}, author = {Bakst, S and Niziolek, CA}, title = {Effects of syllable stress in adaptation to altered auditory feedback in vowels.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {1}, pages = {708}, pmid = {33514177}, issn = {1520-8524}, support = {F32 DC017653/DC/NIDCD NIH HHS/United States ; K99 DC014520/DC/NIDCD NIH HHS/United States ; R00 DC014520/DC/NIDCD NIH HHS/United States ; T32 DC005359/DC/NIDCD NIH HHS/United States ; }, mesh = {*Feedback ; Humans ; Language ; Phonetics ; *Speech ; Speech Acoustics ; *Speech Perception ; Speech Production Measurement ; }, abstract = {Unstressed syllables in English most commonly contain the vowel quality [ə] (schwa), which is cross-linguistically described as having a variable target. The present study examines whether speakers are sensitive to whether their auditory feedback matches their target when producing unstressed syllables. When speakers hear themselves producing formant-altered speech, they will change their motor plans so that their altered feedback is a better match to the target. If schwa has no target, then feedback mismatches in unstressed syllables may not drive a change in production. In this experiment, participants spoke disyllabic words with initial or final stress where the auditory feedback of F1 was raised (Experiment 1) or lowered (Experiment 2) by 100 mels. Both stressed and unstressed syllables showed adaptive changes in F1.
In Experiment 1, initial-stress words showed larger adaptive decreases in F1 than final-stress words, but in Experiment 2, stressed syllables overall showed greater adaptive increases in F1 than unstressed syllables in all words, regardless of which syllable contained the primary stress. These results suggest that speakers are sensitive to feedback mismatches in both stressed and unstressed syllables, but that stress and metrical foot type may mediate the corrective response.}, } @article {pmid33495033, year = {2023}, author = {Hakanpää, T and Waaramaa, T and Laukkanen, AM}, title = {Training the Vocal Expression of Emotions in Singing: Effects of Including Acoustic Research-Based Elements in the Regular Singing Training of Acting Students.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {2}, pages = {293.e7-293.e23}, doi = {10.1016/j.jvoice.2020.12.032}, pmid = {33495033}, issn = {1873-4588}, mesh = {Humans ; *Singing ; *Voice ; Acoustics ; Students ; Emotions ; }, abstract = {OBJECTIVES: This study examines the effects of including acoustic research-based elements of the vocal expression of emotions in the singing lessons of acting students during a seven-week teaching period. This information may be useful in improving the training of interpretation in singing.
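The 100-mel F1 perturbation in the Bakst & Niziolek entry above can be translated into Hz with a mel-scale conversion. The sketch below uses the common 2595*log10(1 + f/700) convention; the authors' exact mel variant and the example F1 value are assumptions.

```python
# What a 100-mel F1 perturbation (as in the Bakst & Niziolek entry above)
# looks like in Hz, using the common O'Shaughnessy mel formula. The authors'
# exact mel variant may differ; the example F1 is hypothetical.
import numpy as np

def hz_to_mel(f):
    return 2595.0 * np.log10(1.0 + f / 700.0)

def mel_to_hz(m):
    return 700.0 * (10.0 ** (m / 2595.0) - 1.0)

f1 = 600.0                                    # hypothetical produced F1 (Hz)
for shift in (+100.0, -100.0):                # Experiment 1 raised, Experiment 2 lowered
    f1_shifted = mel_to_hz(hz_to_mel(f1) + shift)
    print(f"F1 {f1:.0f} Hz {shift:+.0f} mels -> {f1_shifted:.0f} Hz heard")
```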

STUDY DESIGN: Experimental comparative study.

METHODS: Six acting students participated in seven weeks of extra training concerning voice quality in the expression of emotions in singing. Song samples were recorded before and after the training. A control group of six acting students were recorded twice within a seven-week period, during which they participated in ordinary training. All participants sang on the vowel [a:] and on a longer phrase expressing anger, sadness, joy, tenderness, and neutral states. The vowel and phrase samples were evaluated by 34 listeners for the perceived emotion. Additionally, the vowel samples were analyzed for formant frequencies (F1-F4), sound pressure level (SPL), spectral structure (Alpha ratio = SPL(1500-5000 Hz) - SPL(50-1500 Hz)), harmonic-to-noise ratio (HNR), and perturbation (jitter, shimmer).
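A minimal sketch of the Alpha ratio defined above, estimated from a Welch power spectrum, follows; absolute SPL calibration is omitted, so only the band-level difference is meaningful in this form.

```python
# Sketch of the Alpha ratio defined above: level in the 1500-5000 Hz band
# minus level in the 50-1500 Hz band. Analysis settings are illustrative.
import numpy as np
from scipy import signal

def band_level_db(freqs, psd, lo, hi):
    band = (freqs >= lo) & (freqs < hi)
    return 10.0 * np.log10(np.trapz(psd[band], freqs[band]))

def alpha_ratio(x, fs):
    freqs, psd = signal.welch(x, fs, nperseg=4096)
    return band_level_db(freqs, psd, 1500, 5000) - band_level_db(freqs, psd, 50, 1500)

fs = 16000
t = np.arange(fs) / fs
toy = np.sin(2*np.pi*200*t) + 0.1*np.sin(2*np.pi*2500*t)   # toy "voice" signal
print(f"Alpha ratio: {alpha_ratio(toy, fs):.1f} dB")
```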

RESULTS: The number of correctly perceived expressions improved in the test group's vowel samples, while no significant change was observed in the control group. The overall recognition was higher for the phrases than for the vowel samples. Of the acoustic parameters, F1 and SPL significantly differentiated emotions in both groups, and HNR specifically differentiated emotions in the test group. After training, the Alpha ratio was additionally found to differentiate emotional expression statistically significantly.

CONCLUSIONS: The expression of emotion in the singing voice improved after seven weeks of voice quality training. The F1, SPL, Alpha ratio, and HNR differentiated emotional expression. The variation in acoustic parameters became wider after training. Similar changes were not observed after seven weeks of ordinary voice training.}, } @article {pmid33484095, year = {2021}, author = {Mendoza Ramos, V and Paulyn, C and Van den Steen, L and Hernandez-Diaz Huici, ME and De Bodt, M and Van Nuffelen, G}, title = {Effect of boost articulation therapy (BArT) on intelligibility in adults with dysarthria.}, journal = {International journal of language & communication disorders}, volume = {56}, number = {2}, pages = {271-282}, pmid = {33484095}, issn = {1460-6984}, mesh = {Adult ; Behavior Therapy ; *Dysarthria/diagnosis/therapy ; Humans ; Speech Articulation Tests ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {BACKGROUND: The articulatory accuracy of patients with dysarthria is one of the most affected speech dimensions with a high impact on speech intelligibility. Behavioural treatments of articulation can involve either direct or indirect approaches. The latter have been thoroughly investigated and are generally appreciated for their almost immediate effects on articulation and intelligibility. The number of studies on (short-term) direct articulation therapy is limited.

AIMS: To investigate the effects of short-term, boost articulation therapy (BArT) on speech intelligibility in patients with chronic or progressive dysarthria and the effect of severity of dysarthria on the outcome.

METHODS & PROCEDURES: The study consists of a two-group pre-/post-test design to assess speech intelligibility at phoneme and sentence level and during spontaneous speech, automatic speech and reading a phonetically balanced text. A total of 17 subjects with mild to severe dysarthria participated in the study and were randomly assigned to either a patient-tailored, intensive articulatory drill programme or an intensive minimal pair training. Both training programmes were based on the principles of motor learning. Each training programme consisted of five sessions of 45 min completed within one week.

OUTCOMES & RESULTS: Following treatment, a statistically significant increase in mean group intelligibility was shown at phoneme and sentence level, and in automatic sequences. This was supported by an acoustic analysis that revealed a reduction in the formant centralization ratio. Within specific severity groups, large and moderate positive effect sizes (Cohen's d) were demonstrated.

CONCLUSIONS & IMPLICATIONS: BArT successfully improves speech intelligibility in patients with chronic or progressive dysarthria at different levels of the impairment. What this paper adds What is already known on the subject Behavioural treatment of articulation in patients with dysarthria mainly involves indirect strategies, which have shown positive effects on speech intelligibility. However, there is limited evidence on the short-term effects of direct articulation therapy at the segmental level of speech. This study investigates the effectiveness of BArT on speech intelligibility in patients with chronic or progressive dysarthria at all severity levels. What this paper adds to existing knowledge The intensive and direct articulatory therapy programmes developed and applied in this study intend to reduce the impairment instead of compensating for it. This approach results in a significant improvement of speech intelligibility at different dysarthria severity levels in a short period of time, while helping to exploit and develop all available residual motor skills in persons with dysarthria. What are the potential or actual clinical implications of this work? The improvements in intelligibility demonstrate the effectiveness of BArT at the segmental level of speech. This makes it a suitable approach to consider in the treatment of patients with chronic or progressive dysarthria.}, } @article {pmid33455538, year = {2022}, author = {Kulikov, V}, title = {Voice and Emphasis in Arabic Coronal Stops: Evidence for Phonological Compensation.}, journal = {Language and speech}, volume = {65}, number = {1}, pages = {73-104}, pmid = {33455538}, issn = {1756-6053}, mesh = {Cues ; Humans ; Language ; Phonetics ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {The current study investigates multiple acoustic cues associated with the phonological contrasts of voicing and emphasis in the production of Arabic coronal stops: voice onset time (VOT), spectral center of gravity (SCG) of the burst, pitch (F0), and the frequencies of the first (F1) and second (F2) formants at vowel onset. The analysis of the acoustic data collected from eight native speakers of the Qatari dialect showed that the three stops form three distinct modes on the VOT scale: [d] is (pre)voiced, voiceless [t] is aspirated, and emphatic [ṭ] is voiceless unaspirated. The contrast is also maintained in spectral cues. Each cue influences production of coronal stops while their relevance to phonological contrasts varies. VOT was most relevant for voicing, but F2 was mostly associated with emphasis. The perception experiment revealed that listeners were able to categorize ambiguous tokens correctly and compensate for phonological contrasts. The listeners' results were used to evaluate three categorization models to predict the intended category of a coronal stop: a model with unweighted and unadjusted cues, a model with weighted cues compensating for phonetic context, and a model with weighted cues compensating for the voicing and emphasis contrasts.
The findings suggest that the model with phonological compensation performed most similarly to human listeners in terms of both accuracy rate and error pattern.}, } @article {pmid33441596, year = {2021}, author = {Aung, T and Goetz, S and Adams, J and McKenna, C and Hess, C and Roytman, S and Cheng, JT and Zilioli, S and Puts, D}, title = {Low fundamental and formant frequencies predict fighting ability among male mixed martial arts fighters.}, journal = {Scientific reports}, volume = {11}, number = {1}, pages = {905}, pmid = {33441596}, issn = {2045-2322}, mesh = {Acoustics ; Adult ; Aggression/*physiology/psychology ; Anthropometry ; Athletes/psychology ; Biomarkers ; Cues ; Humans ; Male ; Martial Arts/physiology ; Phenotype ; Pitch Discrimination/physiology ; Sexual Behavior/physiology/psychology ; Social Perception/psychology ; Voice/*physiology ; }, abstract = {Human voice pitch is highly sexually dimorphic and eminently quantifiable, making it an ideal phenotype for studying the influence of sexual selection. In both traditional and industrial populations, lower pitch in men predicts mating success, reproductive success, and social status and shapes social perceptions, especially those related to physical formidability. Due to practical and ethical constraints, however, scant evidence tests the central question of whether male voice pitch and other acoustic measures indicate actual fighting ability in humans. To address this, we examined pitch, pitch variability, and formant position of 475 mixed martial arts (MMA) fighters from an elite fighting league, with each fighter's acoustic measures assessed from multiple voice recordings extracted from audio or video interviews available online (YouTube, Google Video, podcasts), totaling 1312 voice recording samples. In four regression models each predicting a separate measure of fighting ability (win percentages, number of fights, Elo ratings, and retirement status), no acoustic measure significantly predicted fighting ability above and beyond covariates. However, after fight statistics, fight history, height, weight, and age were used to extract underlying dimensions of fighting ability via factor analysis, pitch and formant position negatively predicted "Fighting Experience" and "Size" factor scores in a multivariate regression model, explaining 3-8% of the variance. Our findings suggest that lower male pitch and formants may be valid cues of some components of fighting ability in men.}, } @article {pmid33413460, year = {2021}, author = {Volodin, IA and Volodina, EV and Frey, R}, title = {Rutting vocal display in male impala (Aepyceros melampus) and overlap with alarm context.}, journal = {Frontiers in zoology}, volume = {18}, number = {1}, pages = {2}, pmid = {33413460}, issn = {1742-9994}, support = {19-04-00133//Российский Фонд Фундаментальных Исследований (РФФИ)/ ; 19-04-00133//Российский Фонд Фундаментальных Исследований (РФФИ)/ ; }, abstract = {BACKGROUND: The rutting vocal display of male impala Aepyceros melampus is unique for its complexity among ruminants. This study investigates bouts of rutting calls produced towards potential mates and rival males by free-ranging male impala in Namibia. In particular, a comparison of male rutting and alarm snorts is conducted, inspired by earlier findings of mate guarding by using alarm snorts in male topi Damaliscus lunatus.
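Formant position, as used in the Aung et al. entry above, is commonly operationalized as the mean of the z-scored F1-F4 values (cf. Puts et al.). A sketch with hypothetical speaker data follows; the paper's exact operationalization may differ.

```python
# Sketch of a "formant position" measure of the kind used in the Aung et al.
# entry above: each of F1-F4 is z-scored across speakers and the four
# z-scores are averaged per speaker. All values are hypothetical.
import numpy as np

# Hypothetical mean F1-F4 (Hz) for five speakers (rows).
F = np.array([
    [510, 1520, 2480, 3560],
    [540, 1480, 2550, 3610],
    [480, 1410, 2390, 3450],
    [530, 1550, 2600, 3700],
    [500, 1460, 2450, 3520],
], dtype=float)

z = (F - F.mean(axis=0)) / F.std(axis=0)   # standardize each formant across speakers
formant_position = z.mean(axis=1)          # average the four z-scores per speaker
print(formant_position)                    # lower values = lower overall formants
```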

RESULTS: Rutting male impala produced 4-38 (13.5 ± 6.5) rutting calls per bout. We analyzed 201 bouts, containing a total of 2709 rutting calls of five types: continuous roars, produced within a single exhalation-inhalation cycle; interrupted roars, comprising a few exhalation-inhalation cycles; pant-roars, distinguished by a pant-phase with rapidly alternating inhalations and exhalations; usual snorts, lacking any roar part; and roar-snorts, starting with a short roar part. Bouts mostly started and ended with usual snorts. Continuous roars were the shortest roars. The average duration of the exhalatory phase was longest in the continuous roars and shortest in the pant-roars. The average fundamental frequency (49.7-51.4 Hz) did not differ between roar types. Vocal tract length, calculated from measurements of the first four vocal tract resonances (formants), ranged from 381 to 382 mm across all roar types. In the studied male impala, rutting snorts within bouts of rutting calls were longer and had higher upper-quartile values in the call spectra than alarm snorts produced towards potential danger.
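The vocal tract length estimate in the impala results above follows from the uniform-tube approximation, in which a tube closed at the glottis and open at the lips has resonances F_k = (2k-1)c/(4L). A sketch follows, with hypothetical formant values chosen to land near the reported 381-382 mm; the study's exact fitting procedure is not reproduced here.

```python
# Sketch of estimating vocal tract length from the first four formants, as in
# the impala entry above. Assumes a uniform tube closed at the glottis and
# open at the lips, F_k = (2k-1)c/(4L). Formant values are hypothetical.
import numpy as np

c = 350.0                                                 # speed of sound in warm, humid air (m/s)
formants_hz = np.array([230.0, 690.0, 1150.0, 1610.0])    # hypothetical F1-F4
k = np.arange(1, 5)

# Least-squares slope of F_k against (2k-1)/4, constrained through the origin;
# under the tube model this slope equals c/L.
x = (2 * k - 1) / 4.0
slope = np.sum(x * formants_hz) / np.sum(x * x)
vtl_m = c / slope
print(f"Estimated VTL: {vtl_m * 1000:.0f} mm")
```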

CONCLUSIONS: Additional inhalations during the emission of the interrupted and pant-roars prolong their duration compared to the continuous roars but do not affect the fundamental frequency or the degree of larynx retraction while roaring. Alarm snorts are separated from one another by large intervals, whereas the intervals between rutting snorts within bouts are short. Sometimes, rutting snorts alternate with roars, whereas alarm snorts do not. Therefore, it is not the acoustic structure of individual snorts but the temporal sequence and the occasional association with another call type that defines snorts as either rutting or alarm snorts. The rutting snorts of male impala may function to attract the attention of receptive females and delay their departure from a male's harem or territory.}, } @article {pmid33399816, year = {2021}, author = {Bodaghi, D and Jiang, W and Xue, Q and Zheng, X}, title = {Effect of Supraglottal Acoustics on Fluid-Structure Interaction During Human Voice Production.}, journal = {Journal of biomechanical engineering}, volume = {143}, number = {4}, pages = {}, pmid = {33399816}, issn = {1528-8951}, support = {R01 DC009616/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; *Acoustics ; *Glottis/physiology ; *Voice ; Hydrodynamics ; Phonation ; Vibration ; Vocal Cords/physiology ; Models, Biological ; Pressure ; }, abstract = {A hydrodynamic/acoustic splitting method was used to examine the effect of supraglottal acoustics on fluid-structure interactions during human voice production in a two-dimensional computational model. The accuracy of the method in simulating compressible flows in typical human airway conditions was verified by comparing it to full compressible flow simulations. The method was coupled with a three-mass model of vocal fold lateral motion to simulate fluid-structure interactions during human voice production. By separating the acoustic perturbation components of the airflow, the method allows isolation of the role of supraglottal acoustics in fluid-structure interactions. The results showed that an acoustic resonance between a higher harmonic of the sound source and the first formant of the supraglottal tract occurred during normal human phonation when the fundamental frequency was much lower than the formants. The resonance resulted in an acoustic pressure perturbation at the glottis, which was of the same order as the incompressible flow pressure and was found to affect vocal fold vibrations and the glottal flow rate waveform. Specifically, the acoustic perturbation delayed the opening of the glottis, reduced the vertical phase difference of vocal fold vibrations, and decreased flow rate and maximum flow deceleration rate (MFDR) at the glottal exit; yet it had little effect on glottal opening.
The results imply that the sound generation in the glottis and acoustic resonance in the supraglottal tract are coupled processes during human voice production, and that computer modeling of vocal fold vibrations needs to include supraglottal acoustics for accurate predictions.}, } @article {pmid33397591, year = {2023}, author = {Feng, M and Howard, DM}, title = {The Dynamic Effect of the Valleculae on Singing Voice - An Exploratory Study Using 3D Printed Vocal Tracts.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {2}, pages = {178-186}, doi = {10.1016/j.jvoice.2020.12.012}, pmid = {33397591}, issn = {1873-4588}, mesh = {Humans ; *Singing ; Speech Acoustics ; *Voice/physiology ; Acoustics ; Printing, Three-Dimensional ; }, abstract = {BACKGROUND AND OBJECTIVES: The valleculae can be seen as a pair of side branches of the human vocal tract, like the piriform fossae. While the acoustic properties of the piriform fossae have been explored in detail, there is little evidence of full exploration of the acoustic properties of the valleculae. A recent investigation (Vampola, Horáček, & Švec, 2015), using a finite element model of a single vowel /a/, suggests that the valleculae created two antiresonances and two resonances in the high frequency region (above 4 kHz) along with those produced by the piriform sinuses. In the current study, we investigate, in multiple vowels, the acoustic influences of the valleculae on the singing voice, using 3-D printed vocal tracts.

METHOD: MRI data were collected from an operatic tenor singing English vowels /a/, /u/, /i/. The images of each vowel were segmented and edited to create a pair of tracts, one the original and one with the valleculae digitally removed. The printed tracts were then placed atop a vocal tract organ loudspeaker and excited by white noise. Recordings were made with a microphone placed in front of the mouths of the tracts, to measure their frequency responses.

RESULTS: Dimensional changes were observed in the valleculae across vowels, with the long-term average spectra of the recordings illustrating clear differences between the frequency responses of the va-nova (valleculae - no valleculae) pairs; these differences vary with vowel.

CONCLUSION: The experiment demonstrates the dynamic nature of the shapes of the valleculae in the human vocal tract and their acoustic consequences. It provides evidence that the valleculae have similar acoustic properties to the piriform fossae but with larger variations, and in some cases can acoustically influence the frequency region below 4 kHz. The results suggest that large-volume valleculae have the potential to impede to some extent the acoustic effect of the singer's formant cluster, and small valleculae may do the reverse. Since the volume of the valleculae is observed to be largely dependent on tongue movement and on the uttered vowel, it can be assumed that the high frequency energy, including that within the singer's formant region, could be vowel dependent. Strategies to control valleculae volumes are likely to be highly relevant to voice pedagogy practice as well as singing performance.}, } @article {pmid36154080, year = {2021}, author = {Ying Liu, Y and Polka, L and Masapollo, M and Ménard, L}, title = {Disentangling the roles of formant proximity and stimulus prototypicality in adult vowel perception.}, journal = {JASA express letters}, volume = {1}, number = {1}, pages = {015201}, doi = {10.1121/10.0003041}, pmid = {36154080}, issn = {2691-1191}, abstract = {The present investigation examined the extent to which asymmetries in vowel perception derive from a sensitivity to focalization (formant proximity), stimulus prototypicality, or both. English-speaking adults identified, rated, and discriminated a vowel series that spanned a less-focal/prototypic English /u/ and a more-focal/prototypic French /u/ exemplar. Discrimination pairs included one-step, two-step, and three-step intervals along the series. Asymmetries predicted by both focalization and prototype effects emerged when discrimination step-size was varied. The findings indicate that both generic/universal and language-specific biases shape vowel perception in adults; the latter are challenging to isolate without well-controlled stimuli and appropriately scaled discrimination tasks.}, } @article {pmid33379914, year = {2020}, author = {Lovcevic, I and Kalashnikova, M and Burnham, D}, title = {Acoustic features of infant-directed speech to infants with hearing loss.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {6}, pages = {3399}, doi = {10.1121/10.0002641}, pmid = {33379914}, issn = {1520-8524}, mesh = {Acoustics ; Adult ; *Deafness ; Female ; *Hearing Loss/diagnosis ; Humans ; Infant ; Speech ; *Speech Perception ; }, abstract = {This study investigated the effects of hearing loss and hearing experience on the acoustic features of infant-directed speech (IDS) to infants with hearing loss (HL) compared to controls with normal hearing (NH) matched by either chronological or hearing age (experiment 1) and across development in infants with hearing loss as well as the relation between IDS features and infants' developing lexical abilities (experiment 2). Both experiments included detailed acoustic analyses of mothers' productions of the three corner vowels /a, i, u/ and utterance-level pitch in IDS and in adult-directed speech. Experiment 1 demonstrated that IDS to infants with HL was acoustically more variable than IDS to hearing-age matched infants with NH.
Experiment 2 yielded no changes in IDS features over development; however, the results did show a positive relationship between formant distances in mothers' speech and infants' concurrent receptive vocabulary size, as well as between vowel hyperarticulation and infants' expressive vocabulary. These findings suggest that despite infants' HL and thus diminished access to speech input, infants with HL are exposed to IDS with generally similar acoustic qualities as are infants with NH. However, some differences persist, indicating that infants with HL might receive less intelligible speech.}, } @article {pmid33379900, year = {2020}, author = {Nault, DR and Munhall, KG}, title = {Individual variability in auditory feedback processing: Responses to real-time formant perturbations and their relation to perceptual acuity.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {6}, pages = {3709}, doi = {10.1121/10.0002923}, pmid = {33379900}, issn = {1520-8524}, abstract = {In this study, both between-subject and within-subject variability in speech perception and speech production were examined in the same set of speakers. Perceptual acuity was determined using an ABX auditory discrimination task, whereby speakers made judgments between pairs of syllables on a /ɛ/ to /æ/ acoustic continuum. Auditory feedback perturbations of the first two formants were implemented in a production task to obtain measures of compensation, normal speech production variability, and vowel spacing. Speakers repeated the word "head" 120 times under varying feedback conditions, with the final Hold phase involving the strongest perturbations of +240 Hz in F1 and -300 Hz in F2. Multiple regression analyses were conducted to determine whether individual differences in compensatory behavior in the Hold phase could be predicted by perceptual acuity, speech production variability, and vowel spacing. Perceptual acuity significantly predicted formant changes in F1, but not in F2. These results are discussed in consideration of the importance of using larger sample sizes in the field and developing new methods to explore feedback processing at the individual participant level. The potential positive role of variability in speech motor control is also considered.}, } @article {pmid33379892, year = {2020}, author = {Kothare, H and Raharjo, I and Ramanarayanan, V and Ranasinghe, K and Parrell, B and Johnson, K and Houde, JF and Nagarajan, SS}, title = {Sensorimotor adaptation of speech depends on the direction of auditory feedback alteration.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {6}, pages = {3682}, pmid = {33379892}, issn = {1520-8524}, support = {K08 AG058749/AG/NIA NIH HHS/United States ; R01 DC010145/DC/NIDCD NIH HHS/United States ; R01 DC013979/DC/NIDCD NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; }, mesh = {Feedback ; Feedback, Sensory ; Humans ; *Speech ; Speech Acoustics ; *Speech Perception ; }, abstract = {A hallmark feature of speech motor control is its ability to learn to anticipate and compensate for persistent feedback alterations, a process referred to as sensorimotor adaptation. Because this process involves adjusting articulation to counter the perceived effects of altering acoustic feedback, there are a number of factors that affect it, including the complex relationship between acoustics and articulation and non-uniformities of speech perception. 
As a consequence, sensorimotor adaptation is hypothesised to vary as a function of the direction of the applied auditory feedback alteration in vowel formant space. This hypothesis was tested in two experiments where auditory feedback was altered in real time, shifting the frequency values of the first and second formants (F1 and F2) of participants' speech. Shifts were designed on a subject-by-subject basis and sensorimotor adaptation was quantified with respect to the direction of applied shift, normalised for individual speakers. Adaptation was indeed found to depend on the direction of the applied shift in vowel formant space, independent of shift magnitude. These findings have implications for models of sensorimotor adaptation of speech.}, } @article {pmid33379880, year = {2020}, author = {Houle, N and Levi, SV}, title = {Acoustic differences between voiced and whispered speech in gender diverse speakers.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {6}, pages = {4002}, doi = {10.1121/10.0002952}, pmid = {33379880}, issn = {1520-8524}, mesh = {Acoustics ; Phonetics ; *Speech ; Speech Acoustics ; Speech Production Measurement ; *Voice ; }, abstract = {Whispered speech is a naturally produced mode of communication that lacks a fundamental frequency. Several other acoustic differences exist between whispered and voiced speech, such as speaking rate (measured as segment duration) and formant frequencies. Previous research has shown that listeners are less accurate at identifying linguistic information (e.g., identifying a speech sound) and speaker information (e.g., reporting speaker gender) from whispered speech. To further explore differences between voiced and whispered speech, acoustic differences were examined across three datasets (hVd, sVd, and ʃVd) and three speaker groups (ciswomen, transwomen, cismen). Consistent with previous studies, vowel duration was generally longer in whispered speech and formant frequencies were shifted higher, although the magnitude of these differences depended on vowel and gender. Despite the increase in duration, the acoustic vowel space area (measured either with a vowel quadrilateral or with a convex hull) was smaller in the whispered speech, suggesting that larger vowel space areas are not an automatic consequence of a lengthened articulation. Overall, these findings are consistent with previous literature showing acoustic differences between voiced and whispered speech beyond the articulatory change of eliminating fundamental frequency.}, } @article {pmid33369591, year = {2021}, author = {Ananthakrishnan, S and Grinstead, L and Yurjevich, D}, title = {Human Frequency Following Responses to Filtered Speech.}, journal = {Ear and hearing}, volume = {42}, number = {1}, pages = {87-105}, doi = {10.1097/AUD.0000000000000902}, pmid = {33369591}, issn = {1538-4667}, mesh = {Acoustic Stimulation ; Adult ; *Hearing Aids ; Humans ; Noise ; Speech ; *Speech Perception ; }, abstract = {OBJECTIVES: There is increasing interest in using the frequency following response (FFR) to describe the effects of varying different aspects of hearing aid signal processing on brainstem neural representation of speech. To this end, recent studies have examined the effects of filtering on brainstem neural representation of the speech fundamental frequency (f0) in listeners with normal hearing sensitivity by measuring FFRs to low- and high-pass filtered signals. 
However, the stimuli used in these studies do not reflect the entire range of typical cutoff frequencies used in frequency-specific gain adjustments during hearing aid fitting. Further, there has been limited discussion on the effect of filtering on brainstem neural representation of formant-related harmonics. Here, the effects of filtering on brainstem neural representation of speech fundamental frequency (f0) and harmonics related to first formant frequency (F1) were assessed by recording envelope and spectral FFRs to a vowel low-, high-, and band-pass filtered at cutoff frequencies ranging from 0.125 to 8 kHz.
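A sketch of how filtered stimulus conditions of the kind described above can be constructed follows, using zero-phase Butterworth filters from scipy; the filter family and order are assumptions, not the study's specification, and the cutoff values shown are a subset for illustration.

```python
# Sketch of constructing low-, high-, and band-pass filtered stimulus
# conditions like those described above, with zero-phase Butterworth filters.
# Filter order/family and the toy vowel are assumptions for illustration.
import numpy as np
from scipy import signal

fs = 44100
t = np.arange(int(0.3 * fs)) / fs
vowel = (np.sin(2*np.pi*100*t)            # f0 component
         + 0.5*np.sin(2*np.pi*300*t)      # low harmonic
         + 0.3*np.sin(2*np.pi*900*t))     # F1-region harmonic (toy /u/-like)

def filtered(x, btype, cutoff_hz, order=4):
    sos = signal.butter(order, cutoff_hz, btype=btype, fs=fs, output="sos")
    return signal.sosfiltfilt(sos, x)

lowpass_conditions  = {fc: filtered(vowel, "lowpass",  fc) for fc in (125, 250, 500, 750, 1000)}
highpass_conditions = {fc: filtered(vowel, "highpass", fc) for fc in (125, 250, 500, 750, 1000)}
bandpass_example    = filtered(vowel, "bandpass", (125, 1000))
print(len(lowpass_conditions), len(highpass_conditions), bandpass_example.shape)
```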

DESIGN: FFRs were measured to a synthetically generated vowel stimulus /u/ presented in a full bandwidth and low-pass (experiment 1), high-pass (experiment 2), and band-pass (experiment 3) filtered conditions. In experiment 1, FFRs were measured to a synthetically generated vowel stimulus /u/ presented in a full bandwidth condition as well as 11 low-pass filtered conditions (low-pass cutoff frequencies: 0.125, 0.25, 0.5, 0.75, 1, 1.5, 2, 3, 4, 6, and 8 kHz) in 19 adult listeners with normal hearing sensitivity. In experiment 2, FFRs were measured to the same synthetically generated vowel stimulus /u/ presented in a full bandwidth condition as well as 10 high-pass filtered conditions (hig