@article {pmid37260602, year = {2023}, author = {Ashokumar, M and Guichet, C and Schwartz, JL and Ito, T}, title = {Correlation between the effect of orofacial somatosensory inputs in speech perception and speech production performance.}, journal = {Auditory perception & cognition}, volume = {6}, number = {1-2}, pages = {97-107}, pmid = {37260602}, issn = {2574-2450}, abstract = {INTRODUCTION: Orofacial somatosensory inputs modify the perception of speech sounds. Such auditory-somatosensory integration likely develops alongside speech production acquisition. We examined whether the somatosensory effect in speech perception varies depending on individual characteristics of speech production.
METHODS: The somatosensory effect in speech perception was assessed by the change in the category boundary between /e/ and /ø/ in a vowel identification test when somatosensory stimulation, a rearward facial skin deformation corresponding to the articulatory movement for /e/, was applied together with the auditory input. Speech production performance was quantified by the acoustic distances between the average first, second and third formants of /e/ and /ø/ utterances recorded in a separate test.
RESULTS: The category boundary between /e/ and /ø/ was significantly shifted towards /ø/ due to the somatosensory stimulation, consistent with previous research. The amplitude of the category boundary shift was significantly correlated with the acoustic distance between the mean second formants (and, marginally, the third formants) of /e/ and /ø/ productions, with no correlation with the first formant distance.
DISCUSSION: Greater acoustic distances can be related to larger contrasts between the articulatory targets of vowels in speech production. These results suggest that the somatosensory effect in speech perception can be linked to speech production performance.}, }
@article {pmid37227411, year = {2023}, author = {Saba, JN and Ali, H and Hansen, JHL}, title = {The effects of estimation accuracy, estimation approach, and number of selected channels using formant-priority channel selection for an "n-of-m" sound processing strategy for cochlear implants.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {5}, pages = {3100}, doi = {10.1121/10.0019416}, pmid = {37227411}, issn = {1520-8524}, abstract = {Previously, selection of l channels was prioritized according to formant frequency locations in an l-of-n-of-m-based signal processing strategy to provide important voicing information independent of listening environments for cochlear implant (CI) users. In this study, ideal, or ground truth, formants were incorporated into the selection stage to determine the effect of accuracy on (1) subjective speech intelligibility, (2) objective channel selection patterns, and (3) objective stimulation patterns (current). An average +11% improvement (p < 0.05) was observed across six CI users in quiet, but not in noise or reverberation conditions. Analogous increases in channel selection and current for the upper range of F1, and a decrease across mid-frequencies with higher corresponding current, were observed at the expense of noise-dominant channels. Objective channel selection patterns were analyzed a second time to determine the effects of estimation approach and number of selected channels (n). A significant effect of estimation approach was observed only in the noise and reverberation condition, with minor differences in channel selection and significantly decreased stimulated current. Results suggest that the estimation method, accuracy, and number of channels in the proposed strategy using ideal formants may improve intelligibility when the corresponding stimulated current of formant channels is not masked by noise-dominant channels.}, }
@article {pmid37224720, year = {2023}, author = {Carney, LH and Cameron, DA and Kinast, KB and Feld, CE and Schwarz, DM and Leong, UC and McDonough, JM}, title = {Effects of sensorineural hearing loss on formant-frequency discrimination: Measurements and models.}, journal = {Hearing research}, volume = {435}, number = {}, pages = {108788}, doi = {10.1016/j.heares.2023.108788}, pmid = {37224720}, issn = {1878-5891}, abstract = {This study concerns the effect of hearing loss on discrimination of formant frequencies in vowels. In the response of the healthy ear to a harmonic sound, auditory-nerve (AN) rate functions fluctuate at the fundamental frequency, F0. Responses of inner hair cells (IHCs) tuned near spectral peaks are captured (or dominated) by a single harmonic, resulting in lower fluctuation depths than responses of IHCs tuned between spectral peaks. Therefore, the depth of neural fluctuations (NFs) varies along the tonotopic axis and encodes spectral peaks, including formant frequencies of vowels. This NF code is robust across a wide range of sound levels and in background noise. The NF profile is converted into a rate-place representation in the auditory midbrain, wherein neurons are sensitive to low-frequency fluctuations. The NF code is vulnerable to sensorineural hearing loss (SNHL) because capture depends upon saturation of IHCs, and thus the interaction of cochlear gain with IHC transduction. In this study, formant-frequency discrimination limens (DLFFs) were estimated for listeners with normal hearing or mild to moderate SNHL. The F0 was fixed at 100 Hz, and formant peaks were either aligned with harmonic frequencies or placed between harmonics. Formant peak frequencies were 600 and 2000 Hz, in the range of first and second formants of several vowels. The difficulty of the task was varied by changing formant bandwidth to modulate the contrast in the NF profile. Results were compared to predictions from model auditory-nerve and inferior colliculus (IC) neurons, with listeners' audiograms used to individualize the AN model. Correlations between DLFFs, audiometric thresholds near the formant frequencies, age, and scores on the Quick speech-in-noise test are reported. SNHL had a strong effect on DLFF for the second formant frequency (F2), but a relatively small effect on DLFF for the first formant (F1). The IC model appropriately predicted substantial threshold elevations for changes in F2 as a function of SNHL and little effect of SNHL on thresholds for changes in F1.}, }
@article {pmid37214801, year = {2023}, author = {Rizzi, R and Bidelman, GM}, title = {Duplex perception reveals brainstem auditory representations are modulated by listeners' ongoing percept for speech.}, journal = {bioRxiv : the preprint server for biology}, volume = {}, number = {}, pages = {}, doi = {10.1101/2023.05.09.540018}, pmid = {37214801}, abstract = {So-called duplex speech stimuli with perceptually ambiguous spectral cues to one ear and isolated low- vs. high-frequency third formant "chirp" to the opposite ear yield a coherent percept supporting their phonetic categorization. Critically, such dichotic sounds are only perceived categorically upon binaural integration. Here, we used frequency-following responses (FFRs), scalp-recorded potentials reflecting phase-locked subcortical activity, to investigate brainstem responses to fused speech percepts and to determine whether FFRs reflect binaurally integrated category-level representations. We recorded FFRs to diotic and dichotic stop-consonants (/da/, /ga/) that either did or did not require binaural fusion to properly label along with perceptually ambiguous sounds without clear phonetic identity. Behaviorally, listeners showed clear categorization of dichotic speech tokens confirming they were heard with a fused, phonetic percept. Neurally, we found FFRs were stronger for categorically perceived speech relative to category-ambiguous tokens but also differentiated phonetic categories for both diotically and dichotically presented speech sounds. Correlations between neural and behavioral data further showed FFR latency predicted the degree to which listeners labeled tokens as "da" vs. "ga". The presence of binaurally integrated, category-level information in FFRs suggests human brainstem processing reflects a surprisingly abstract level of the speech code typically circumscribed to much later cortical processing.}, }
@article {pmid37212513, year = {2023}, author = {Cox, SR and Huang, T and Chen, WR and Ng, ML}, title = {An acoustic study of Cantonese alaryngeal speech in different speaking conditions.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {5}, pages = {2973}, doi = {10.1121/10.0019471}, pmid = {37212513}, issn = {1520-8524}, abstract = {Esophageal (ES) speech, tracheoesophageal (TE) speech, and the electrolarynx (EL) are common methods of communication following the removal of the larynx. Our recent study demonstrated that intelligibility may increase for Cantonese alaryngeal speakers using clear speech (CS) compared to their everyday "habitual speech" (HS), but the reasoning is still unclear [Hui, Cox, Huang, Chen, and Ng (2022). Folia Phoniatr. Logop. 74, 103-111]. The purpose of this study was to assess the acoustic characteristics of vowels and tones produced by Cantonese alaryngeal speakers using HS and CS. Thirty-one alaryngeal speakers (9 EL, 10 ES, and 12 TE speakers) read The North Wind and the Sun passage in HS and CS. Vowel formants, vowel space area (VSA), speaking rate, pitch, and intensity were examined, and their relationship to intelligibility was evaluated. Statistical models suggest that larger VSAs significantly improved intelligibility, but slower speaking rate did not. Vowel and tonal contrasts did not differ between HS and CS for all three groups, but the amount of information encoded in fundamental frequency and intensity differences between high and low tones positively correlated with intelligibility for TE and ES groups, respectively. Continued research is needed to understand the effects of different speaking conditions toward improving acoustic and perceptual characteristics of Cantonese alaryngeal speech.}, }
@article {pmid37210244, year = {2023}, author = {Valls-Ontañón, A and Ferreiro, M and Moragues-Aguiló, B and Molins-Ballabriga, G and Julián-González, S and Sauca-Balart, A and Hernández-Alfaro, F}, title = {Impact of 3-dimensional anatomical changes secondary to orthognathic surgery on voice resonance and articulatory function: a prospective study.}, journal = {The British journal of oral & maxillofacial surgery}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.bjoms.2023.04.007}, pmid = {37210244}, issn = {1532-1940}, abstract = {An evaluation was made of the impact of orthognathic surgery (OS) on speech, addressing in particular the effects of skeletal and airway changes on voice resonance characteristics and articulatory function. A prospective study was carried out involving 29 consecutive patients subjected to OS. Preoperative and short- and long-term postoperative evaluations were made of anatomical changes (skeletal and airway measurements), speech evolution (assessed objectively by acoustic analysis: fundamental frequency, local jitter, local shimmer of each vowel, and formants F1 and F2 of vowel /a/), and articulatory function (use of compensatory musculature, point of articulation, and speech intelligibility). These were also assessed subjectively by means of a visual analogue scale. Articulatory function after OS showed immediate improvement and had further progressed at one year of follow-up. This improvement significantly correlated with the anatomical changes, and was also notably perceived by the patient. On the other hand, although a slight modification in vocal resonance was reported and seen to correlate with anatomical changes of the tongue, hyoid bone, and airway, it was not subjectively perceived by the patients. In conclusion, the results demonstrated that OS had beneficial effects on articulatory function and imperceptible subjective changes in a patient's voice. Patients subjected to OS, apart from benefitting from improved articulatory function, should not be afraid that they will not recognise their voice after treatment.}, }
@article {pmid37205390, year = {2023}, author = {Shellikeri, S and Cho, S and Ash, S and Gonzalez-Recober, C and McMillan, CT and Elman, L and Quinn, C and Amado, DA and Baer, M and Irwin, DJ and Massimo, L and Olm, C and Liberman, M and Grossman, M and Nevler, N}, title = {Digital markers of motor speech impairments in natural speech of patients with ALS-FTD spectrum disorders.}, journal = {medRxiv : the preprint server for health sciences}, volume = {}, number = {}, pages = {}, doi = {10.1101/2023.04.29.23289308}, pmid = {37205390}, abstract = {BACKGROUND AND OBJECTIVES: Patients with ALS-FTD spectrum disorders (ALS-FTSD) have mixed motor and cognitive impairments and require valid and quantitative assessment tools to support diagnosis and tracking of bulbar motor disease. This study aimed to validate a novel automated digital speech tool that analyzes vowel acoustics from natural, connected speech as a marker for impaired articulation due to bulbar motor disease in ALS-FTSD.
METHODS: We used an automatic algorithm called Forced Alignment Vowel Extraction (FAVE) to detect spoken vowels and extract vowel acoustics from 1-minute audio-recorded picture descriptions. Using automated acoustic analysis scripts, we derived two articulatory-acoustic measures: vowel space area (VSA, in Bark²), which represents tongue range of motion (size), and the average second formant slope of vowel trajectories (F2 slope), which represents tongue movement speed. We compared vowel measures between ALS with and without clinically evident bulbar motor disease (ALS+bulbar vs. ALS-nonbulbar), behavioral variant frontotemporal dementia (bvFTD) without a motor syndrome, and healthy controls (HC). We correlated impaired vowel measures with bulbar disease severity, estimated by clinical bulbar scores and perceived listener effort, and with MRI cortical thickness of the orobuccal part of the primary motor cortex innervating the tongue (oralPMC). We also tested correlations with respiratory capacity and cognitive impairment.
RESULTS: Participants were 45 ALS+bulbar (30 males, mean age=61±11), 22 ALS-nonbulbar (11 males, age=62±10), 22 bvFTD (13 males, age=63±7), and 34 HC (14 males, age=69±8). ALS+bulbar had smaller VSA and shallower average F2 slopes than ALS-nonbulbar (VSA: |d|=0.86, p=0.0088; F2 slope: |d|=0.98, p=0.0054), bvFTD (VSA: |d|=0.67, p=0.043; F2 slope: |d|=1.4, p<0.001), and HC (VSA: |d|=0.73, p=0.024; F2 slope: |d|=1.0, p<0.001). Vowel measures declined with worsening bulbar clinical scores (VSA: R=0.33, p=0.033; F2 slope: R=0.25, p=0.048), and smaller VSA was associated with greater listener effort (R=-0.43, p=0.041). Shallower F2 slopes were related to cortical thinning in oralPMC (R=0.50, p=0.03). Neither vowel measure was associated with respiratory or cognitive test scores.
CONCLUSIONS: Vowel measures extracted with automatic processing from natural speech are sensitive to bulbar motor disease in ALS-FTD and are robust to cognitive impairment.}, }
@article {pmid37203275, year = {2023}, author = {Easwar, V and Peng, ZE and Mak, V and Mikiel-Hunter, J}, title = {Differences between children and adults in the neural encoding of voice fundamental frequency in the presence of noise and reverberation.}, journal = {The European journal of neuroscience}, volume = {}, number = {}, pages = {}, doi = {10.1111/ejn.16049}, pmid = {37203275}, issn = {1460-9568}, abstract = {Environmental noise and reverberation challenge speech understanding more significantly in children than in adults. However, the neural/sensory basis for the difference is poorly understood. We evaluated the impact of noise and reverberation on the neural processing of the fundamental frequency of voice (f0), an important cue to tag or recognize a speaker. In a group of 39 6-15-year-old children and 26 adults with normal hearing, envelope following responses (EFRs) were elicited by a male-spoken /i/ in quiet, noise, reverberation, and both noise and reverberation. Due to the increased resolvability of harmonics at lower than at higher vowel formants, which may affect susceptibility to noise and/or reverberation, the /i/ was modified to elicit two EFRs: one initiated by the low-frequency first formant (F1) and the other initiated by the mid-to-high-frequency second and higher formants (F2+), with predominantly resolved and unresolved harmonics, respectively. F1 EFRs were more susceptible to noise, whereas F2+ EFRs were more susceptible to reverberation. Reverberation resulted in greater attenuation of F1 EFRs in adults than in children, and greater attenuation of F2+ EFRs in older than in younger children. Reduced modulation depth caused by reverberation and noise explained changes in F2+ EFRs but was not the primary determinant for F1 EFRs. Experimental data paralleled modelled EFRs, especially for F1. Together, data suggest that noise or reverberation influences the robustness of f0 encoding depending on the resolvability of vowel harmonics, and that maturation of processing temporal/envelope information of voice is delayed in reverberation, particularly for low-frequency stimuli.}, }
@article {pmid37173176, year = {2023}, author = {Wang, Y and Hattori, M and Masaki, K and Sumita, YI}, title = {Detailed speech evaluation including formant 3 analysis and voice visualization in maxillofacial rehabilitation: A clinical report.}, journal = {The Journal of prosthetic dentistry}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.prosdent.2023.02.022}, pmid = {37173176}, issn = {1097-6841}, abstract = {Objective speech evaluations such as analysis of formants 1 and 2 and nasality measurement have been used in maxillofacial rehabilitation for outcome assessment. However, in some patients, those evaluations are insufficient to assess a specific or unique problem. This report describes the use of a new speech evaluation including formant 3 analysis and voice visualization in a patient with a maxillofacial defect. The patient was a 67-year-old man who had a maxillary defect that opened to the maxillary sinus and who had an unnatural voice even when wearing an obturator. Nasality was low, and the frequencies of formants 1 and 2 were normal even without the obturator. However, a low frequency of formant 3 and a shifted center of voice were observed. These results indicated that the unnatural voice was related to increased resonant volume in the pharynx rather than hypernasality. This patient demonstrates that advanced speech analysis can be useful for detecting the cause of a speech disorder and planning maxillofacial rehabilitation.}, }
@article {pmid37138997, year = {2023}, author = {Cavalcanti, JC and Eriksson, A and Barbosa, PA}, title = {On the speaker discriminatory power asymmetry regarding acoustic-phonetic parameters and the impact of speaking style.}, journal = {Frontiers in psychology}, volume = {14}, number = {}, pages = {1101187}, doi = {10.3389/fpsyg.2023.1101187}, pmid = {37138997}, issn = {1664-1078}, abstract = {This study aimed to assess what we refer to as the speaker discriminatory power asymmetry and its forensic implications in comparisons performed in different speaking styles: spontaneous dialogues vs. interviews. We also addressed the impact of data sampling on speaker discriminatory performance concerning different acoustic-phonetic estimates. The participants were 20 male speakers of Brazilian Portuguese from the same dialectal area. The speech material consisted of spontaneous telephone conversations between familiar individuals, and interviews conducted between each individual participant and the researcher. Nine acoustic-phonetic parameters were chosen for the comparisons, spanning from temporal and melodic to spectral acoustic-phonetic estimates. Ultimately, an analysis based on the combination of different parameters was also conducted. Two speaker discriminatory metrics were examined: the log-likelihood-ratio cost (Cllr) and equal error rate (EER) values. A general speaker discriminatory trend was suggested when assessing the parameters individually. Parameters pertaining to the temporal acoustic-phonetic class showed the weakest performance in terms of speaker contrasting power, as evidenced by relatively higher Cllr and EER values. Moreover, from the set of acoustic parameters assessed, spectral parameters, mainly high formant frequencies, i.e., F3 and F4, performed best in terms of speaker discrimination, showing the lowest EER and Cllr scores. The results appear to suggest a speaker discriminatory power asymmetry concerning parameters from different acoustic-phonetic classes, in which temporal parameters tended to present lower discriminatory power. The speaking style mismatch also seemed to considerably impact the speaker comparison task by undermining the overall discriminatory performance. A statistical model based on the combination of different acoustic-phonetic estimates was found to perform best in this case. Finally, data sampling has proven to be of crucial relevance for the reliability of discriminatory power assessment.}, }
@article {pmid37129674, year = {2023}, author = {Zaltz, Y}, title = {The effect of stimulus type and testing method on talker discrimination of school-age children.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {5}, pages = {2611}, doi = {10.1121/10.0017999}, pmid = {37129674}, issn = {1520-8524}, abstract = {Efficient talker discrimination (TD) improves speech understanding under multi-talker conditions. So far, TD of children has been assessed using various testing parameters, making it difficult to draw comparative conclusions. This study explored the effects of the stimulus type and variability on children's TD. Thirty-two children (7-10 years old) underwent eight TD assessments with fundamental frequency + formant changes using an adaptive procedure. Stimuli included consonant-vowel-consonant words or three-word sentences and were either fixed by run or by trial (changing throughout the run). Cognitive skills were also assessed. Thirty-one adults (18-35 years old) served as controls. The results showed (1) poorer TD for the fixed-by-trial than the fixed-by-run method, with both stimulus types for the adults but only with the words for the children; (2) poorer TD for the words than the sentences with the fixed-by-trial method only for the children; and (3) significant correlations between the children's age and TD. These results support a developmental trajectory in the use of perceptual anchoring for TD and in its reliance on comprehensive acoustic and linguistic information. The finding that the testing parameters may influence the top-down and bottom-up processing for TD should be considered when comparing data across studies or when planning new TD experiments.}, }
@article {pmid37128454, year = {2022}, author = {Ghosh, S and Feng, Z and Bian, J and Butler, K and Prosperi, M}, title = {DR-VIDAL - Doubly Robust Variational Information-theoretic Deep Adversarial Learning for Counterfactual Prediction and Treatment Effect Estimation on Real World Data.}, journal = {AMIA ... Annual Symposium proceedings. AMIA Symposium}, volume = {2022}, number = {}, pages = {485-494}, pmid = {37128454}, issn = {1942-597X}, mesh = {Humans ; Prognosis ; *Electronic Health Records ; Causality ; }, abstract = {Determining causal effects of interventions onto outcomes from real-world, observational (non-randomized) data, e.g., treatment repurposing using electronic health records, is challenging due to underlying bias. Causal deep learning has improved over traditional techniques for estimating individualized treatment effects (ITE). We present the Doubly Robust Variational Information-theoretic Deep Adversarial Learning (DR-VIDAL), a novel generative framework that combines two joint models of treatment and outcome, ensuring an unbiased ITE estimation even when one of the two is misspecified. DR-VIDAL integrates: (i) a variational autoencoder (VAE) to factorize confounders into latent variables according to causal assumptions; (ii) an information-theoretic generative adversarial network (Info-GAN) to generate counterfactuals; (iii) a doubly robust block incorporating treatment propensities for outcome predictions. On synthetic and real-world datasets (Infant Health and Development Program, Twin Birth Registry, and National Supported Work Program), DR-VIDAL achieves better performance than other non-generative and generative methods. In conclusion, DR-VIDAL uniquely fuses causal assumptions, VAE, Info-GAN, and double robustness into a comprehensive, performant framework. Code is available at: https://github.com/Shantanu48114860/DR-VIDAL-AMIA-22 under MIT license.}, }
@article {pmid37116009, year = {2023}, author = {Li, M and Erickson, IM and Cross, EV and Lee, JD}, title = {It's Not Only What You Say, But Also How You Say It: Machine Learning Approach to Estimate Trust from Conversation.}, journal = {Human factors}, volume = {}, number = {}, pages = {187208231166624}, doi = {10.1177/00187208231166624}, pmid = {37116009}, issn = {1547-8181}, abstract = {OBJECTIVE: The objective of this study was to estimate trust from conversations using both lexical and acoustic data.
BACKGROUND: As NASA moves to long-duration space exploration operations, the increasing need for cooperation between humans and virtual agents requires real-time trust estimation by virtual agents. Measuring trust through conversation is a novel and unintrusive approach.
METHOD: A 2 (reliability) × 2 (cycles) × 3 (events) within-subject study with habitat system maintenance was designed to elicit various levels of trust in a conversational agent. Participants had trust-related conversations with the conversational agent at the end of each decision-making task. To estimate trust, subjective trust ratings were predicted using machine learning models trained on three types of conversational features (i.e., lexical, acoustic, and combined). After training, model explanation was performed using variable importance and partial dependence plots.
RESULTS: Results showed that a random forest algorithm, trained using the combined lexical and acoustic features, predicted trust in the conversational agent most accurately (adjusted R² = 0.71). The most important predictors were a combination of lexical and acoustic cues: average sentiment considering valence shifters, the mean of formants, and Mel-frequency cepstral coefficients (MFCC). These conversational features were identified as partial mediators predicting people's trust.
CONCLUSION: Precise trust estimation from conversation requires lexical cues and acoustic cues.
APPLICATION: These results showed the possibility of using conversational data to measure trust, and potentially other dynamic mental states, unobtrusively and dynamically.}, }
@article {pmid37106680, year = {2023}, author = {Teixeira, FL and Costa, MRE and Abreu, JP and Cabral, M and Soares, SP and Teixeira, JP}, title = {A Narrative Review of Speech and EEG Features for Schizophrenia Detection: Progress and Challenges.}, journal = {Bioengineering (Basel, Switzerland)}, volume = {10}, number = {4}, pages = {}, pmid = {37106680}, issn = {2306-5354}, abstract = {Schizophrenia is a mental illness that affects an estimated 21 million people worldwide. The literature establishes that electroencephalography (EEG) is a well-implemented means of studying and diagnosing mental disorders. However, it is known that speech and language provide unique and essential information about human thought. Semantic and emotional content, semantic coherence, syntactic structure, and complexity can thus be combined in a machine learning process to detect schizophrenia. Several studies show that early identification is crucial to prevent the onset of illness or mitigate possible complications. Therefore, it is necessary to identify disease-specific biomarkers for an early diagnosis support system. This work contributes to improving our knowledge about schizophrenia and the features that can identify this mental illness via speech and EEG. The emotional state is a specific characteristic of schizophrenia that can be identified with speech emotion analysis. The speech features most often used in the reviewed literature are fundamental frequency (F0), intensity/loudness (I), formant frequencies (F1, F2, and F3), Mel-frequency cepstral coefficients (MFCCs), the duration of pauses and sentences (SD), and the duration of silence between words. Combining at least two feature categories achieved high accuracy in schizophrenia classification. Prosodic and spectral or temporal features achieved the highest accuracy. The work with the highest accuracy used the prosodic and spectral features QEVA, SDVV, and SSDL, which were derived from the F0 and the spectrogram. The emotional state can be identified with most of the features previously mentioned (F0, I, F1, F2, F3, MFCCs, and SD), linear prediction cepstral coefficients (LPCC), line spectral features (LSF), and the pause rate. Among event-related potential (ERP) measures, the most promising features found in the literature are mismatch negativity (MMN), P2, P3, P50, N1, and N2. The EEG features with the highest accuracy in schizophrenia classification are nonlinear features, such as Cx, HFD, and Lya.}, }
@article {pmid37105171, year = {2023}, author = {Oganian, Y and Bhaya-Grossman, I and Johnson, K and Chang, EF}, title = {Vowel and formant representation in the human auditory speech cortex.}, journal = {Neuron}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.neuron.2023.04.004}, pmid = {37105171}, issn = {1097-4199}, abstract = {Vowels, a fundamental component of human speech across all languages, are cued acoustically by formants, the resonance frequencies of the vocal tract during speaking. An outstanding question in neurolinguistics is how formants are processed neurally during speech perception. To address this, we collected high-density intracranial recordings from the human speech cortex on the superior temporal gyrus (STG) while participants listened to continuous speech. We found that two-dimensional receptive fields based on the first two formants provided the best characterization of vowel sound representation. Neural activity at single sites was highly selective for zones in this formant space. Furthermore, formant tuning is adjusted dynamically for speaker-specific spectral context. However, the entire population of formant-encoding sites was required to accurately decode single vowels. Overall, our results reveal that complex acoustic tuning in the two-dimensional formant space underlies local vowel representations in STG. As a population code, this gives rise to phonological vowel perception.}, }
@article {pmid37080890, year = {2023}, author = {Herbst, CT and Story, BH and Meyer, D}, title = {Acoustical Theory of Vowel Modification Strategies in Belting.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.01.004}, pmid = {37080890}, issn = {1873-4588}, abstract = {Various authors have argued that belting is to be produced by "speech-like" sounds, with the first and second supraglottic vocal tract resonances (fR1 and fR2) at frequencies of the vowels determined by the lyrics to be sung. Acoustically, the hallmark of belting has been identified as a dominant second harmonic, possibly enhanced by first resonance tuning (fR1≈2fo). It is not clear how both these concepts - (a) phonating with "speech-like," unmodified vowels; and (b) producing a belting sound with a dominant second harmonic, typically enhanced by fR1 - can be upheld when singing across a singer's entire musical pitch range. For instance, anecdotal reports from pedagogues suggest that vowels with a low fR1, such as [i] or [u], might have to be modified considerably (by raising fR1) in order to phonate at higher pitches. These issues were systematically addressed in silico with respect to treble singing, using a linear source-filter voice production model. The dominant harmonic of the radiated spectrum was assessed in 12,987 simulations, covering a parameter space of 37 fundamental frequencies (fo) across the musical pitch range from C3 to C6; 27 voice source spectral slope settings from -4 to -30 dB/octave; computed for 13 different IPA vowels. The results suggest that, for most unmodified vowels, the stereotypical belting sound characteristics with a dominant second harmonic can only be produced over a pitch range of about a musical fifth, centered at fo≈0.5fR1. In the [ɔ] and [ɑ] vowels, that range is extended to an octave, supported by a low second resonance. Data aggregation - considering the relative prevalence of vowels in American English - suggests that, historically, belting with fR1≈2fo was derived from speech, and that songs with an extended musical pitch range likely demand considerable vowel modification. We thus argue that - on acoustical grounds - the pedagogical commandment for belting with unmodified, "speech-like" vowels cannot always be fulfilled.}, }
@article {pmid37078508, year = {2023}, author = {Dillon, MT and Helpard, L and Brown, KD and Selleck, AM and Richter, ME and Rooth, MA and Thompson, NJ and Dedmon, MM and Ladak, HM and Agrawal, S}, title = {Influence of the Frequency-to-Place Function on Recognition with Place-Based Cochlear Implant Maps.}, journal = {The Laryngoscope}, volume = {}, number = {}, pages = {}, doi = {10.1002/lary.30710}, pmid = {37078508}, issn = {1531-4995}, abstract = {OBJECTIVE: Comparison of acute speech recognition for cochlear implant (CI) alone and electric-acoustic stimulation (EAS) users listening with default maps or place-based maps using either a spiral ganglion (SG) or a new Synchrotron Radiation-Artificial Intelligence (SR-AI) frequency-to-place function.
METHODS: Thirteen adult CI-alone or EAS users completed a task of speech recognition at initial device activation with maps that differed in the electric filter frequency assignments. The three map conditions were: (1) maps with the default filter settings (default map), (2) place-based maps with filters aligned to cochlear SG tonotopicity using the SG function (SG place-based map), and (3) place-based maps with filters aligned to cochlear Organ of Corti (OC) tonotopicity using the SR-AI function (SR-AI place-based map). Speech recognition was evaluated using a vowel recognition task. Performance was scored as the percent correct for formant 1 recognition, because the maps were expected to deviate the most in the estimated cochlear place frequency for low frequencies.
RESULTS: On average, participants had better performance with the OC SR-AI place-based map as compared to the SG place-based map and the default map. A larger performance benefit was observed for EAS users than for CI-alone users.
CONCLUSION: These pilot data suggest that EAS and CI-alone users may experience better performance with a patient-centered mapping approach that accounts for the variability in cochlear morphology (OC SR-AI frequency-to-place function) in the individualization of the electric filter frequencies (place-based mapping procedure).
LEVEL OF EVIDENCE: 3.}, }
@article {pmid37071803, year = {2023}, author = {Terband, H and van Brenk, F}, title = {Modeling Responses to Auditory Feedback Perturbations in Adults, Children, and Children With Complex Speech Sound Disorders: Evidence for Impaired Auditory Self-Monitoring?}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {}, number = {}, pages = {1-25}, doi = {10.1044/2023_JSLHR-22-00379}, pmid = {37071803}, issn = {1558-9102}, abstract = {PURPOSE: Previous studies have found that typically developing (TD) children were able to compensate and adapt to auditory feedback perturbations to a similar or larger degree compared to young adults, while children with speech sound disorder (SSD) were found to produce predominantly following responses. However, large individual differences lie underneath the group-level results. This study investigates possible mechanisms in responses to formant shifts by modeling parameters of feedback and feedforward control of speech production based on behavioral data.
METHOD: SimpleDIVA was used to model an existing dataset of compensation/adaptation behavior to auditory feedback perturbations collected from three groups of Dutch speakers: 50 young adults, twenty-three 4- to 8-year-old children with TD speech, and seven 4- to 8-year-old children with SSD. Between-groups and individual within-group differences in model outcome measures representing auditory and somatosensory feedback control gain and feedforward learning rate were assessed.
RESULTS: Notable between-groups and within-group variation was found for all outcome measures. Data modeled for individual speakers yielded model fits with varying reliability. Auditory feedback control gain was negative in children with SSD and positive in both other groups. Somatosensory feedback control gain was negative for both groups of children and marginally negative for adults. Feedforward learning rates were highest in the children with TD speech, followed by the children with SSD, and lowest in the adults.
CONCLUSIONS: The SimpleDIVA model was able to account for responses to auditory feedback perturbation other than corrective responses, as negative auditory feedback control gains were associated with following responses to vowel shifts. These preliminary findings are suggestive of impaired auditory self-monitoring in children with complex SSD. Possible mechanisms underlying the nature of following responses are discussed.
@article {pmid37059081, year = {2023}, author = {Chao, SC and Daliri, A}, title = {Effects of Gradual and Sudden Introduction of Perturbations on Adaptive Responses to Formant-Shift and Formant-Clamp Perturbations.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {}, number = {}, pages = {1-12}, doi = {10.1044/2023_JSLHR-21-00435}, pmid = {37059081}, issn = {1558-9102}, abstract = {PURPOSE: When the speech motor system encounters errors, it generates adaptive responses to compensate for the errors. Unlike errors induced by formant-shift perturbations, errors induced by formant-clamp perturbations do not correspond with the speaker's speech (i.e., degraded motor-to-auditory correspondence). We previously showed that adaptive responses to formant-clamp perturbations are smaller than responses to formant-shift perturbations when perturbations are introduced gradually. This study examined responses to formant-clamp and formant-shift perturbations when perturbations are introduced suddenly.
METHOD: One group of participants (n = 30) experienced gradually introduced formant-clamp and formant-shift perturbations, and another group (n = 30) experienced suddenly introduced formant-clamp and formant-shift perturbations. We designed the perturbations based on participant-specific vowel configurations such that a participant's first and second formants of /ɛ/ were perturbed toward their /æ/. To estimate adaptive responses, we measured formant changes (0-100 ms of the vowel) in response to the formant perturbations.
RESULTS: We found that (a) the difference between responses to formant-clamp and formant-shift perturbations was smaller when the perturbations were introduced suddenly and (b) responses to suddenly introduced (but not gradually introduced) formant-shift perturbations positively correlated with responses to formant-clamp perturbations.
CONCLUSIONS: These results showed that the speech motor system responds to errors induced by formant-shift and formant-clamp perturbations more differently when perturbations are introduced gradually than suddenly. Overall, the quality of errors (formant-shift vs. formant-clamp) and the manner of introducing errors (gradually vs. suddenly) modulate the speech motor system's evaluations of and responses to errors.
SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.22406422.}, }
@article {pmid37040323, year = {2023}, author = {Luo, X and Daliri, A}, title = {The Impact of Bimodal Hearing on Speech Acoustics of Vowel Production in Adult Cochlear Implant Users.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {}, number = {}, pages = {1-14}, doi = {10.1044/2023_JSLHR-22-00201}, pmid = {37040323}, issn = {1558-9102}, abstract = {PURPOSE: This study aimed to investigate the acoustic changes in vowel production with different forms of auditory feedback via cochlear implant (CI), hearing aid (HA), and bimodal hearing (CI + HA).
METHOD: Ten post-lingually deaf adult bimodal CI users (aged 50-78 years) produced English vowels /i/, /ɛ/, /æ/, /ɑ/, /ʊ/, and /u/ in the context of /hVd/ during short-term use of no device (ND), HA, CI, and CI + HA. Segmental features (first formant frequency [F1], second formant frequency [F2], and vowel space area) and suprasegmental features (duration, intensity, and fundamental frequency [fo]) of vowel production were analyzed. Participants also categorized a vowel continuum synthesized from their own productions of /ɛ/ and /æ/ using HA, CI, and CI + HA.
RESULTS: F1s of all vowels decreased; F2s of front vowels but not back vowels increased; vowel space areas increased; and vowel durations, intensities, and fos decreased with statistical significance in the HA, CI, and CI + HA conditions relative to the ND condition. Only fos were lower, and vowel space areas were larger, with CI and CI + HA than with HA. Average changes in fo, intensity, and F1 from the ND condition to the HA, CI, and CI + HA conditions were positively correlated. Most participants did not show a typical psychometric function for vowel categorization, and thus, the relationship between vowel categorization and production was not tested.
CONCLUSIONS: The results suggest that acoustic, electric, and bimodal hearing have a measurable impact on the vowel acoustics of post-lingually deaf adults when their hearing devices are turned on and off temporarily. Also, changes in fo and F1 with the use of hearing devices may be largely driven by changes in intensity.}, }
@article {pmid37031224, year = {2023}, author = {Hsu, TC and Wu, BX and Lin, RT and Chien, CJ and Yeh, CY and Chang, TH}, title = {Electron-phonon interaction toward engineering carrier mobility of periodic edge structured graphene nanoribbons.}, journal = {Scientific reports}, volume = {13}, number = {1}, pages = {5781}, pmid = {37031224}, issn = {2045-2322}, abstract = {Graphene nanoribbons have many extraordinary electrical properties and are candidates for the semiconductor industry. In this research, we propose a design of Coved GNRs with a periodic structure ranging from 4 to 8 nm or more, whose size is within the practical feature sizes of advanced lithography tools. The carrier transport properties of Coved GNRs with the periodic coved shape are designed to break the localized electronic state and reduce electron-phonon scattering. In this way, the mobility of Coved GNRs can be enhanced by orders of magnitude compared with zigzag GNRs of the same width. Moreover, in contrast to the occasional zero-bandgap transition of armchair and zigzag GNRs without precision control at the atomic level, the Coved GNRs with periodic edge structures can exclude the zero-bandgap conditions, which makes the mass production process practical. The designed Coved GNRs are fabricated over the germanium (110) substrate, where the graphene can be prepared in single-crystalline and single-oriented form and the edge of the GNRs is later repaired under "balanced condition growth". We demonstrate that the proposed coved structures are compatible with current fabrication facilities.}, }
@article {pmid37015000, year = {2023}, author = {Vorperian, HK and Kent, RD and Lee, Y and Buhr, KA}, title = {Vowel Production in Children and Adults With Down Syndrome: Fundamental and Formant Frequencies of the Corner Vowels.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {66}, number = {4}, pages = {1208-1239}, doi = {10.1044/2022_JSLHR-22-00510}, pmid = {37015000}, issn = {1558-9102}, mesh = {Male ; Female ; Humans ; Adult ; Child ; Child, Preschool ; Adolescent ; Young Adult ; Middle Aged ; Aged ; Aged, 80 and over ; *Speech Acoustics ; *Down Syndrome ; Phonetics ; Speech Intelligibility ; Acoustics ; }, abstract = {PURPOSE: Atypical vowel production contributes to reduced speech intelligibility in children and adults with Down syndrome (DS). This study compares the acoustic data of the corner vowels /i/, /u/, /æ/, and /ɑ/ from speakers with DS against typically developing/developed (TD) speakers.
METHOD: Measurements of the fundamental frequency (fo) and first four formant frequencies (F1-F4) were obtained from single-word recordings containing the target vowels from 81 participants with DS (ages 3-54 years) and 293 TD speakers (ages 4-92 years), all native speakers of English. The data were used to construct developmental trajectories and to determine interspeaker and intraspeaker variability.
RESULTS: Trajectories for DS differed from TD based on age and sex, but the groups were similar in showing a striking change in fo and F1-F4 frequencies around age 10 years. Findings confirm higher fo in DS, and vowel-specific differences between DS and TD in F1 and F2 frequencies, but not in F3 and F4. The F2 difference between front and back vowels was a more sensitive measure of compression than reduced vowel space area/centralization across age and sex. Low vowels had more pronounced F2 compression, related to reduced speech intelligibility. Intraspeaker variability was significantly greater for DS than TD for nearly all frequency values across age.
DISCUSSION: Vowel production differences between DS and TD are age- and sex-specific, which helps explain contradictory results in previous studies. Increased intraspeaker variability across age in DS confirms the presence of a persisting motor speech disorder. Atypical vowel production in DS is common and related to dysmorphology, delayed development, and disordered motor control.}, }
@article {pmid37005127, year = {2023}, author = {Capobianco, S and Nacci, A and Calcinoni, O and Bruschini, L and Berrettini, S and Bottalico, P}, title = {Assessing Acoustic Parameters in Early Music and Romantic Operatic Singing.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2023.02.009}, pmid = {37005127}, issn = {1873-4588}, abstract = {OBJECTIVE: Since the recent early music (EM) revival, a subset of singers have begun to specialize in a style of singing that is perceptually different from the more "mainstream" romantic operatic (RO) singing style. The aim of this study is to characterize EM with respect to RO singing in terms of its vibrato characteristics and the singer's formant cluster.
STUDY DESIGN: This study presents a within-subject experimental design.
METHODS: Ten professional singers (5 F, 5 M) versed in both EM and RO repertoire were enrolled in the study. Each singer recorded the first 10 bars of the famous aria "Amarilli Mia Bella" (Giulio Caccini, 1602) a cappella, in RO and EM styles, in random order. Three sustained notes were extracted from the acoustical recordings and were analyzed using the free user-friendly software Biovoice to extract five parameters: vibrato rate, vibrato extent, vibrato jitter (Jvib), vibrato shimmer, and quality ratio (QR), an estimation of the singer's formant power.
RESULTS: Vibrato in EM singing was characterized by a higher rate, a smaller extent, and a less regular cycle-to-cycle period duration (higher Jvib) compared to RO singing. As in previous studies, RO singing presented a more prominent singer's formant, as indicated by a smaller QR.
CONCLUSIONS: Acoustical analysis of some vibrato characteristics and the singer's formant significantly differentiated EM from RO singing styles. Given the acoustical distinctions between EM and RO styles, future scientific and musicological studies should consider distinguishing between the two styles rather than using a single term for, and description of, Western classical singing.}, }
@article {pmid37003707, year = {2023}, author = {Wood, S}, title = {Dating the open /æ/ sound change in Southern British English.}, journal = {JASA express letters}, volume = {3}, number = {3}, pages = {035205}, doi = {10.1121/10.0015281}, pmid = {37003707}, issn = {2691-1191}, abstract = {The new open /æ/ was not noticed in the non-regional received pronunciation (RP) accent of Southern British English until the 1980s. Dating to the 1950s or 1920s had been suggested, but the earliest known regional speaker was born in Kent in the 1860s. Formant data from archived recordings of 29 Southeastern speakers, born between the 1850s and 1960s, were studied using two methods: inspection of formant diagrams for closer /æ/, and modelling low vowels for open /æ/. The earliest RP speaker found with the new open /æ/ was born in 1857, demonstrating that this type of sound change had started by the 1850s.}, }
@article {pmid37002095, year = {2023}, author = {Serrurier, A and Neuschaefer-Rube, C}, title = {Morphological and acoustic modeling of the vocal tract.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {3}, pages = {1867}, doi = {10.1121/10.0017356}, pmid = {37002095}, issn = {1520-8524}, mesh = {Male ; Female ; Humans ; *Speech Acoustics ; Phonetics ; Speech ; *Voice ; Acoustics ; }, abstract = {In speech production, the anatomical morphology forms the substrate on which the speakers build their articulatory strategy to reach specific articulatory-acoustic goals. The aim of this study is to characterize morphological inter-speaker variability by building a shape model of the full vocal tract including hard and soft structures. Static magnetic resonance imaging data from 41 speakers articulating altogether 1947 phonemes were considered, and the midsagittal articulator contours were manually outlined. A phoneme-independent average-articulation representative of morphology was calculated as the speaker mean articulation. A principal component analysis-driven shape model was derived from average-articulations, leading to five morphological components, which explained 87% of the variance. Almost three-quarters of the variance was related to independent variations of the horizontal oral and vertical pharyngeal lengths, the latter capturing male-female differences. The three additional components captured shape variations related to head tilt and palate shape. Plane wave propagation acoustic simulations were run to characterize morphological components. A lengthening of 1 cm of the vocal tract in the vertical or horizontal directions led to a decrease in formant values of 7%-8%. Further analyses are required to analyze three-dimensional variability and to understand the morphological-acoustic relationships per phoneme. Average-articulations and model code are publicly available (https://github.com/tonioser/VTMorphologicalModel).}, }
@article {pmid36949035, year = {2023}, author = {Lou, Q and Wang, X and Chen, Y and Wang, G and Jiang, L and Liu, Q}, title = {Subjective and Objective Evaluation of Speech in Adult Patients With Repaired Cleft Palate.}, journal = {The Journal of craniofacial surgery}, volume = {}, number = {}, pages = {}, doi = {10.1097/SCS.0000000000009301}, pmid = {36949035}, issn = {1536-3732}, abstract = {OBJECTIVE: To explore the speech outcomes of adult patients with repaired cleft palate through subjective perception evaluation and objective acoustic analysis, and to compare the differences in pronunciation characteristics between speakers with complete velopharyngeal closure (VPC) and speakers with velopharyngeal insufficiency (VPI).
PARTICIPANTS AND INTERVENTION: Subjective evaluation indicators included speech intelligibility, nasality, and consonant missing rate. For the objective acoustic analysis, we used speech sample normalization; the objective acoustic parameters included normalized vowel formants, voice onset time, and the analysis of 3-dimensional spectrogram and spectrum. Analyses were carried out on speech samples produced by 4 groups of speakers: (a) speakers with velopharyngeal competence after palatorrhaphy (n=38); (b) speakers with velopharyngeal incompetence after palatorrhaphy (n=70); (c) adult patients with cleft palate (n=65); and (d) typical speakers (n=30).
RESULTS: There was a highly negative correlation between VPC grade and speech intelligibility (ρ=-0.933), and a highly positive correlation between VPC and nasality (ρ=0.813). In subjective evaluation, the speech level of VPI patients was significantly lower than that of VPC patients and normal adults. Although the nasality and consonant loss rate of VPC patients were significantly higher than that of normal adults, the speech intelligibility of VPC patients was not significantly different from that of normal adults. In acoustic analysis, patients with VPI still performed poorly compared with patients with VPC.
CONCLUSIONS: The speech function of adult cleft palate patients is affected by abnormal palatal structure and poor pronunciation habits. In subjective evaluation, there was no significant difference in speech level between VPC patients and normal adults, whereas there was a significant difference between VPI patients and normal adults. The acoustic parameters differed between the 2 groups after cleft palate repair. The condition of velopharyngeal closure after cleft palate repair can affect the patient's speech.}, }
@article {pmid36946195, year = {2023}, author = {Easwar, V and Purcell, D and Wright, T}, title = {Predicting Hearing aid Benefit Using Speech-Evoked Envelope Following Responses in Children With Hearing Loss.}, journal = {Trends in hearing}, volume = {27}, number = {}, pages = {23312165231151468}, pmid = {36946195}, issn = {2331-2165}, mesh = {Humans ; Male ; Child ; *Hearing Aids ; Speech ; *Hearing Loss/diagnosis/therapy ; *Deafness ; *Speech Perception/physiology ; *Hearing Loss, Sensorineural/diagnosis/therapy ; }, abstract = {Electroencephalography could serve as an objective tool to evaluate hearing aid benefit in infants who are developmentally unable to participate in hearing tests. We investigated whether speech-evoked envelope following responses (EFRs), a type of electroencephalography-based measure, could predict improved audibility with the use of a hearing aid in children with mild-to-severe permanent, mainly sensorineural, hearing loss. In 18 children, EFRs were elicited by six male-spoken band-limited phonemic stimuli--the first formants of /u/ and /i/, the second and higher formants of /u/ and /i/, and the fricatives /s/ and /ʃ/--presented together as /suʃi/. EFRs were recorded between the vertex and nape, when /suʃi/ was presented at 55, 65, and 75 dB SPL using insert earphones in unaided conditions and individually fit hearing aids in aided conditions. EFR amplitude and detectability improved with the use of a hearing aid, and the degree of improvement in EFR amplitude was dependent on the extent of change in behavioral thresholds between unaided and aided conditions. EFR detectability was primarily influenced by audibility; higher sensation level stimuli had an increased probability of detection. Overall EFR sensitivity in predicting audibility was significantly higher in aided (82.1%) than unaided conditions (66.5%) and did not vary as a function of stimulus or frequency. EFR specificity in ascertaining inaudibility was 90.8%. Aided improvement in EFR detectability was a significant predictor of hearing aid-facilitated change in speech discrimination accuracy. Results suggest that speech-evoked EFRs could be a useful objective tool in predicting hearing aid benefit in children with hearing loss.}, }
@article {pmid36945094, year = {2023}, author = {Duan, H and Xie, Q and Zhang, Z}, title = {Characteristics of Alveolo-palatal Affricates Produced by Mandarin-speaking Children with Repaired Cleft Palate.}, journal = {American journal of health behavior}, volume = {47}, number = {1}, pages = {13-20}, doi = {10.5993/AJHB.47.1.2}, pmid = {36945094}, issn = {1945-7359}, mesh = {Humans ; Child ; Child, Preschool ; *Cleft Palate/surgery ; Phonetics ; Language ; }, abstract = {Objectives: In this study, we examined the acoustic properties of the affricates /tɕ/ and /tɕʰ/ in Mandarin Chinese and analyzed the differences in the acoustic characteristics of these affricates produced by children with repaired cleft palate and normally developing children. We also explored the relationship between the affricates and the high-front vowel /i/. Methods: We analyzed 16 monosyllabic words with alveolo-palatal affricates as the initial consonants, produced by children with repaired cleft palate (N=13, mean age=5.9 years) and normally developing children (N=6, mean age=5.3 years). We used several acoustic parameters to investigate the characteristics of these affricates, such as the center of gravity, VOT, and the formants of vowels. Results: Compared with normally developing children, children with cleft palate exhibited a lower center of gravity for the 2 affricates /tɕ/ and /tɕʰ/. Data from the control group showed that the affricate /tɕʰ/ had a significantly greater center of gravity than /tɕ/. The accuracy of /tɕ, tɕʰ/ produced by speakers with cleft palate was significantly correlated with that of /i/ (r=0.63). The high-front vowel /i/ is a more valuable index of speech intelligibility than /a/ or /u/. There was a significant difference in F2 of the vowel /i/ between children with cleft palate before speech therapy (CS1) and after speech therapy (CS2). After speech intervention, the accuracy of affricates produced by children with cleft palate improved, and the acoustic pattern of "stop + noise" segments appeared. Conclusion: Children with cleft palate can be better distinguished from children with normal development by 2 significant acoustic characteristics: center of gravity and VOT. As the alveolo-palatal affricates /tɕ, tɕʰ/ and the high-front vowel /i/ have a similar place of articulation (front tongue blade), their production accuracy can be improved mutually. The analysis showed that the articulation of Chinese /i/ has a higher, more frontal lingual position and less variability, which makes it more conducive to articulation training and improves the effect of cleft palate speech training. These findings suggest a potential relationship between the affricates /tɕ, tɕʰ/ and the vowel /i/. Children with cleft palate have difficulty pronouncing /tɕ, tɕʰ/ and /i/; it is better to start training with the vowel /i/, resulting in improvement in overall speech intelligibility.}, }
@article {pmid36938342, year = {2023}, author = {Alghowinem, S and Gedeon, T and Goecke, R and Cohn, JF and Parker, G}, title = {Interpretation of Depression Detection Models via Feature Selection Methods.}, journal = {IEEE transactions on affective computing}, volume = {14}, number = {1}, pages = {133-152}, pmid = {36938342}, issn = {1949-3045}, support = {R01 MH051435/MH/NIMH NIH HHS/United States ; R01 MH065376/MH/NIMH NIH HHS/United States ; R01 MH096951/MH/NIMH NIH HHS/United States ; }, abstract = {Given the prevalence of depression worldwide and its major impact on society, several studies employed artificial intelligence modelling to automatically detect and assess depression. However, the interpretation of these models and cues is rarely discussed in detail in the AI community, although it has received increased attention lately. In this study, we aim to analyse the commonly selected features using a proposed framework of several feature selection methods and their effect on the classification results, which will provide an interpretation of the depression detection model. The developed framework aggregates and selects the most promising features for modelling depression detection from 38 feature selection algorithms of different categories. Using three real-world depression datasets, 902 behavioural cues were extracted from speech behaviour, speech prosody, eye movement and head pose. To verify the generalisability of the proposed framework, we applied the entire process to depression datasets individually and when combined. The results from the proposed framework showed that speech behaviour features (e.g. pauses) are the most distinctive features of the depression detection model. From the speech prosody modality, the strongest feature groups were F0, HNR, formants, and MFCC, while for the eye activity modality they were left-right eye movement and gaze direction, and for the head modality it was yaw head movement. Modelling depression detection using the selected features (only 9 in total) outperformed using all features in all the individual and combined datasets. Our feature selection framework not only provided an interpretation of the model but also produced higher depression detection accuracy with a small number of features in varied datasets. This could help reduce the processing time needed to extract features and create the model.}, }
@article {pmid36882955, year = {2023}, author = {Hauser, I}, title = {Differential Cue Weighting in Mandarin Sibilant Production.}, journal = {Language and speech}, volume = {}, number = {}, pages = {238309231152495}, doi = {10.1177/00238309231152495}, pmid = {36882955}, issn = {1756-6053}, abstract = {Individual talkers vary in their relative use of different cues to signal phonological contrast. Previous work provides limited and conflicting data on whether such variation is modulated by cue trading or individual differences in speech style. This paper examines differential cue weighting patterns in Mandarin sibilants as a test case for these hypotheses. Standardized Mandarin exhibits a three-way place contrast between retroflex, alveopalatal, and alveolar sibilants with individual differences in relative weighting of spectral center of gravity (COG) and the second formant of the following vowel (F2). In results from a speech production task, cue weights of COG and F2 are inversely correlated across speakers, demonstrating a trade-off relationship in cue use. These findings are consistent with a cue trading account of individual differences in contrast signaling.}, }
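Hauser's cue-trading analysis rests on estimating per-speaker cue weights and then correlating them across speakers. A minimal sketch of that logic in Python, assuming a data frame with hypothetical columns speaker, place, cog, and f2, and simplifying the three-way place contrast to a binary one (an illustration, not the paper's model):

import pandas as pd
from scipy.stats import pearsonr, zscore
from sklearn.linear_model import LogisticRegression

def cue_weights(df):
    # One logistic regression per speaker; standardized predictors make
    # the two coefficients comparable as relative cue weights.
    rows = []
    for speaker, d in df.groupby("speaker"):
        X = d[["cog", "f2"]].apply(zscore)
        y = (d["place"] == "retroflex").astype(int)
        w_cog, w_f2 = LogisticRegression().fit(X, y).coef_[0]
        rows.append((speaker, abs(w_cog), abs(w_f2)))
    return pd.DataFrame(rows, columns=["speaker", "w_cog", "w_f2"])

# A negative across-speaker correlation between w_cog and w_f2 would
# indicate the trade-off pattern reported above:
# w = cue_weights(df); print(pearsonr(w["w_cog"], w["w_f2"]))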
@article {pmid36880531, year = {2023}, author = {Yang, X and Guo, C and Zhang, M and Li, Y and Ren, M and Mao, S and Dhakal, R and Kim, NY and Dong, Z and Sun, B and Yao, Z}, title = {Ultrahigh-sensitivity multi-parameter tacrolimus solution detection based on an anchor planar millifluidic microwave biosensor.}, journal = {Analytical methods : advancing methods and applications}, volume = {15}, number = {14}, pages = {1765-1774}, doi = {10.1039/d3ay00100h}, pmid = {36880531}, issn = {1759-9679}, mesh = {*Tacrolimus ; Microwaves ; Radio Waves ; Limit of Detection ; *Biosensing Techniques ; }, abstract = {To detect drug concentration in tacrolimus solution, an anchor planar millifluidic microwave (APMM) biosensor is proposed. The millifluidic system integrated with the sensor enables accurate and efficient detection while eliminating interference caused by the fluidity of the tacrolimus sample. Different concentrations (10-500 ng mL[-1]) of the tacrolimus analyte were introduced into the millifluidic channel, where it completely interacts with the radio frequency patch electromagnetic field, thereby effectively and sensitively modifying the resonant frequency and amplitude of the transmission coefficient. Experimental results indicate that the sensor has an extremely low limit of detection (LoD) of 0.12 pg mL[-1] and a frequency detection resolution (FDR) of 1.59 MHz/(ng mL[-1]). The greater the FDR and the lower the LoD, the more feasible the label-free biosensing method. Regression analysis revealed a strong linear correlation (R[2] = 0.992) between the concentration of tacrolimus and the frequency difference of the two resonant peaks of APMM. In addition, the difference in the reflection coefficient between the two formants was measured and calculated, and a strong linear correlation (R[2] = 0.998) was found between the difference and tacrolimus concentration. Five measurements were performed on each individual sample of tacrolimus to validate the biosensor's high repeatability. Consequently, the proposed biosensor is a potential candidate for the early detection of tacrolimus drug concentration levels in organ transplant recipients. This study presents a simple method for constructing microwave biosensors with high sensitivity and rapid response.}, }
@article {pmid36859160, year = {2023}, author = {Liu, Z and Xu, Y}, title = {Deep learning assessment of syllable affiliation of intervocalic consonants.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {2}, pages = {848}, doi = {10.1121/10.0017117}, pmid = {36859160}, issn = {1520-8524}, mesh = {Male ; Humans ; *Deep Learning ; Acoustics ; Emotions ; Judgment ; Language ; }, abstract = {In English, a sentence like "He made out our intentions." could be misperceived as "He may doubt our intentions." because the coda /d/ sounds like it has become the onset of the next syllable. The nature and occurrence condition of this resyllabification phenomenon are unclear, however. Previous empirical studies mainly relied on listener judgment, limited acoustic evidence, such as voice onset time, or average formant values to determine the occurrence of resyllabification. This study tested the hypothesis that resyllabification is a coarticulatory reorganisation that realigns the coda consonant with the vowel of the next syllable. Deep learning in conjunction with dynamic time warping (DTW) was used to assess syllable affiliation of intervocalic consonants. The results suggest that convolutional neural network- and recurrent neural network-based models can detect cases of resyllabification using Mel-frequency spectrograms. DTW analysis shows that neural network inferred resyllabified sequences are acoustically more similar to their onset counterparts than their canonical productions. A binary classifier further suggests that, similar to the genuine onsets, the inferred resyllabified coda consonants are coarticulated with the following vowel. These results are interpreted with an account of resyllabification as a speech-rate-dependent coarticulatory reorganisation mechanism in speech.}, }
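The paper's DTW comparison can be sketched in a few lines: extract Mel-spectrogram frame sequences for two tokens, align them by dynamic programming, and treat a smaller normalized path cost as greater acoustic similarity. The sketch below assumes librosa for feature extraction and placeholder file names; it illustrates the technique, not the authors' implementation.

import numpy as np
import librosa

def mel_frames(path, sr=16000, n_mels=40):
    y, sr = librosa.load(path, sr=sr)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(S).T  # frames x Mel bands

def dtw_distance(A, B):
    # Classic DTW with a Euclidean local cost; returns path cost
    # normalized by the combined sequence length.
    n, m = len(A), len(B)
    D = np.full((n + 1, m + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = np.linalg.norm(A[i - 1] - B[j - 1])
            D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
    return D[n, m] / (n + m)

# Smaller distance = acoustically more similar:
# print(dtw_distance(mel_frames("coda_token.wav"), mel_frames("onset_token.wav")))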
@article {pmid36859151, year = {2023}, author = {Lasota, M and Šidlof, P and Maurerlehner, P and Kaltenbacher, M and Schoder, S}, title = {Anisotropic minimum dissipation subgrid-scale model in hybrid aeroacoustic simulations of human phonation.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {2}, pages = {1052}, doi = {10.1121/10.0017202}, pmid = {36859151}, issn = {1520-8524}, mesh = {Humans ; *Acoustics ; Anisotropy ; Computer Simulation ; *Computer Systems ; Phonation ; }, abstract = {This article deals with large-eddy simulations of three-dimensional incompressible laryngeal flow followed by acoustic simulations of human phonation of five cardinal English vowels, /ɑ, æ, i, o, u/. The flow and aeroacoustic simulations were performed in OpenFOAM and the in-house code openCFS, respectively. Given the large variety of scales in the flow and acoustics, the simulation is separated into two steps: (1) computing the flow in the larynx using the finite volume method on a fine moving grid with 2.2 million elements, followed by (2) computing the sound sources separately and wave propagation to the radiation zone around the mouth using the finite element method on a coarse static grid with 33 000 elements. The numerical results showed that the anisotropic minimum dissipation model, which is not well known since it is not available in common CFD software, predicted stronger sound pressure levels at higher harmonics, and especially at the first two formants, than the wall-adapting local eddy-viscosity model. Employing this subgrid-scale model for the turbulent flow in the larynx thus had a positive impact on the quality of the simulated vowels.}, }
@article {pmid36857868, year = {2023}, author = {Huang, Z and Lobbezoo, F and Vanhommerig, JW and Volgenant, CMC and de Vries, N and Aarab, G and Hilgevoord, AAJ}, title = {Effects of demographic and sleep-related factors on snoring sound parameters.}, journal = {Sleep medicine}, volume = {104}, number = {}, pages = {3-10}, doi = {10.1016/j.sleep.2023.02.012}, pmid = {36857868}, issn = {1878-5506}, mesh = {Adult ; Humans ; Male ; Middle Aged ; *Snoring ; *Sleep Apnea, Obstructive ; Sleep ; Polysomnography ; Demography ; }, abstract = {OBJECTIVE: To investigate the effect of frequently reported between-individual (viz., age, gender, body mass index [BMI], and apnea-hypopnea index [AHI]) and within-individual (viz., sleep stage and sleep position) snoring sound-related factors on snoring sound parameters in temporal, intensity, and frequency domains.
METHODS: This study included 83 adult snorers (mean ± SD age: 42.2 ± 11.3 yrs; male gender: 59%) who underwent an overnight polysomnography (PSG) and simultaneous sound recording, from which a total of 131,745 snoring events were extracted and analyzed. Data on both between-individual and within-individual factors were extracted from the participants' PSG reports.
RESULTS: Gender did not have any significant effect on snoring sound parameters. The fundamental frequency (FF; coefficient = -0.31; P = 0.02) and dominant frequency (DF; coefficient = -12.43; P < 0.01) of snoring sounds decreased with increasing age, and the second formant increased with increasing BMI (coefficient = 22.91; P = 0.02). Severe obstructive sleep apnea (OSA; AHI ≥30 events/hour), non-rapid eye movement sleep stage 3 (N3), and supine position were all associated with more, longer, and louder snoring events (P < 0.05). Supine position was associated with higher FF and DF, and lateral decubitus positions were associated with higher formants.
CONCLUSIONS: Within the limitations of the current patient profile and included factors, AHI was found to have greater effects on snoring sound parameters than the other between-individual factors. The included within-individual factors were found to have greater effects on snoring sound parameters than the between-individual factors under study.}, }
@article {pmid36844947, year = {2023}, author = {Wang, L and Jiang, Z}, title = {Tidal Volume Level Estimation Using Respiratory Sounds.}, journal = {Journal of healthcare engineering}, volume = {2023}, number = {}, pages = {4994668}, pmid = {36844947}, issn = {2040-2309}, mesh = {Humans ; *Respiratory Sounds ; Snoring ; Tidal Volume ; *Sleep Apnea, Obstructive ; Algorithms ; }, abstract = {Respiratory sounds have been used as a noninvasive and convenient method to estimate respiratory flow and tidal volume. However, current methods need calibration, making them difficult to use in a home environment. A respiratory sound analysis method is proposed to qualitatively estimate tidal volume levels during sleep. Respiratory sounds are filtered and segmented into one-minute clips, and all clips are clustered into three categories (normal breathing, snoring, and uncertain) with agglomerative hierarchical clustering (AHC). Formant parameters are extracted to classify snoring clips into simple snoring and obstructive snoring with the K-means algorithm. For simple snoring clips, the tidal volume level is calculated based on snoring duration. For obstructive snoring clips, the tidal volume level is calculated from the maximum breathing pause interval. The performance of the proposed method is evaluated on an open dataset, PSG-Audio, in which full-night polysomnography (PSG) and tracheal sound were recorded simultaneously. The calculated tidal volume levels are compared with the corresponding lowest nocturnal oxygen saturation (LoO2) data. Experiments show that the proposed method calculates tidal volume levels with high accuracy and robustness.}, }
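The two-stage clustering pipeline described above maps naturally onto standard tooling: agglomerative clustering splits one-minute clips into three categories, then K-means on formant parameters separates simple from obstructive snoring. A minimal sketch, assuming feature extraction already yields one vector per clip (illustrative only, not the authors' code):

import numpy as np
from sklearn.cluster import AgglomerativeClustering, KMeans

def classify_clips(clip_features, snore_formants):
    # Stage 1: normal breathing / snoring / uncertain.
    breathing_labels = AgglomerativeClustering(n_clusters=3).fit_predict(clip_features)
    # Stage 2: simple vs. obstructive snoring, on formant parameters only.
    snore_labels = KMeans(n_clusters=2, n_init=10).fit_predict(snore_formants)
    return breathing_labels, snore_labels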
@article {pmid36816289, year = {2023}, author = {Aldamen, H and Al-Deaibes, M}, title = {Arabic emphatic consonants as produced by English speakers: An acoustic study.}, journal = {Heliyon}, volume = {9}, number = {2}, pages = {e13401}, pmid = {36816289}, issn = {2405-8440}, abstract = {This study examines the production of emphatic consonants as produced by American L2 learners of Arabic. To this end, 19 participants, 5 native speakers and 14 L2 learners, participated in a production experiment in which they produced monosyllabic CVC pairs that were contrasted in terms of whether the initial consonant was plain or emphatic. The acoustic parameters investigated were the VOT of voiceless stops, the COG of fricatives, and the first three formant frequencies of the target vowels. The results of the native speakers showed that VOT is a reliable acoustic correlate of emphasis in MSA. The results also showed that vowels in the emphatic context have higher F1 and F3 and lower F2. The results showed that the L2 learners produced comparable VOT values to those of native Arabic speakers. Further, L2 learners produced a significantly lower F2 of the vowels in the emphatic context than that in the plain context. Proficiency in Arabic played a role in the F2 measure; the intermediate learners tended to be more native-like than the beginning learners. As for F3, the results of the L2 learners unexpectedly showed that the beginning learners produced a higher F3 in the context of fricatives only. This suggests that the relationship between emphasis and proficiency depends on whether the preceding consonant is a stop or fricative.}, }
@article {pmid36816122, year = {2023}, author = {Ali, IE and Sumita, Y and Wakabayashi, N}, title = {Comparison of Praat and Computerized Speech Lab for formant analysis of five Japanese vowels in maxillectomy patients.}, journal = {Frontiers in neuroscience}, volume = {17}, number = {}, pages = {1098197}, pmid = {36816122}, issn = {1662-4548}, abstract = {INTRODUCTION: Speech impairment is a common complication after surgical resection of maxillary tumors. Maxillofacial prosthodontists play a critical role in restoring this function so that affected patients can enjoy better lives. For that purpose, several acoustic software packages have been used for speech evaluation, among which Computerized Speech Lab (CSL) and Praat are widely used in clinical and research contexts. Although CSL is a commercial product, Praat is freely available on the internet and can be used by patients and clinicians to practice several therapy goals. Therefore, this study aimed to determine whether both programs produce comparable results for the first two formant frequencies (F1 and F2) and their respective formant ranges obtained from the same voice samples from Japanese participants with maxillectomy defects.
METHODS: CSL was used as a reference to evaluate the accuracy of Praat with both the default and newly proposed adjusted settings. Thirty-seven participants were enrolled in this study for formant analysis of the five Japanese vowels (a/i/u/e/o) using CSL and Praat. Spearman's rank correlation coefficient was used to judge the correlation between the analysis results of both programs regarding F1 and F2 and their respective formant ranges.
RESULTS: Highly positive correlations between the two programs were found for all acoustic features and all Praat settings.
DISCUSSION: The strong correlations between the results of both CSL and Praat suggest that both programs may have similar decision strategies for atypical speech and for both sexes. This study highlights that the default settings in Praat can be used for formant analysis in maxillectomy patients with predictable accuracy. The proposed adjusted settings in Praat can yield more accurate results for formant analysis of atypical speech in maxillectomy cases when the examiner cannot precisely locate the formant frequencies using the default settings or confirm analysis results obtained using CSL.}, }
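For readers who script formant measurements rather than use a GUI, Praat's Burg analysis is also reachable from Python via the parselmouth package. The sketch below extracts F1 and F2 at a vowel midpoint with generic default-style settings (five formants, 5500 Hz ceiling); the file name is a placeholder, and these are not the adjusted settings proposed by the authors.

import parselmouth

snd = parselmouth.Sound("vowel_token.wav")  # placeholder file
formant = snd.to_formant_burg(max_number_of_formants=5, maximum_formant=5500)
t_mid = snd.duration / 2
f1 = formant.get_value_at_time(1, t_mid)
f2 = formant.get_value_at_time(2, t_mid)
print(f"F1 = {f1:.0f} Hz, F2 = {f2:.0f} Hz")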
@article {pmid36748155, year = {2023}, author = {Zhang, C and Hou, Q and Guo, TT and Zhong, JT and Ren, H and Li, GL}, title = {[The effect of Wendler Glottoplasty to elevate vocal pitch in transgender women].}, journal = {Zhonghua er bi yan hou tou jing wai ke za zhi = Chinese journal of otorhinolaryngology head and neck surgery}, volume = {58}, number = {2}, pages = {139-144}, doi = {10.3760/cma.j.cn115330-20220518-00275}, pmid = {36748155}, issn = {1673-0860}, mesh = {Humans ; Male ; Female ; Young Adult ; Adult ; Middle Aged ; *Transgender Persons ; Retrospective Studies ; Speech Acoustics ; Voice Quality ; Phonation ; }, abstract = {Objective: To evaluate the effect of Wendler Glottoplasty in elevating vocal pitch in transgender women. Methods: The voice parameters measured pre-surgery and 3 months post-surgery of 29 transgender women who underwent Wendler Glottoplasty in the Department of Otorhinolaryngology Head and Neck Surgery of Beijing Friendship Hospital from January 2017 to October 2020 were retrospectively analyzed. The 29 transgender women ranged in age from 19 to 47 years (27.0±6.3). Subjective evaluation was performed using the Transsexual Voice Questionnaire for Male to Female (TVQ[MtF]). Objective parameters included fundamental frequency (F0), highest pitch, lowest pitch, habitual volume, jitter, shimmer, maximal phonation time (MPT), noise-to-harmonic ratio (NHR), and formant frequencies (F1, F2, F3, F4). SPSS 25.0 software was used for statistical analysis. Results: Three months after surgery, the score of TVQ[MtF] was significantly decreased [(89.9±14.7) vs. (50.4±13.6), t=11.49, P<0.001]. The F0 was significantly elevated [(152.7±23.3) Hz vs. (207.7±45.9) Hz, t=-6.03, P<0.001]. Frequencies of F1, F2 and F3 were significantly elevated. No statistical difference was observed in the frequencies of F4. The highest pitch was not significantly altered while the lowest pitch was significantly elevated [(96.8±17.7) Hz vs. (120.0±28.9) Hz, t=-3.71, P=0.001]. Habitual speech volume was significantly increased [(60.0±5.2) dB vs. (63.6±9.6) dB, t=-2.12, P=0.043]. Jitter, shimmer, NHR and MPT were not obviously altered (P>0.05). Conclusions: Wendler Glottoplasty could notably elevate the vocal pitch, formant frequencies, and degree of vocal femininity in transgender women without affecting phonation ability and voice quality. It can be an effective treatment modality for voice feminization.}, }
@article {pmid36742666, year = {2022}, author = {Gunjawate, DR and Ravi, R and Tauro, JP and Philip, R}, title = {Spectral and Temporal Characteristics of Vowels in Konkani.}, journal = {Indian journal of otolaryngology and head and neck surgery : official publication of the Association of Otolaryngologists of India}, volume = {74}, number = {Suppl 3}, pages = {4870-4879}, pmid = {36742666}, issn = {2231-3796}, abstract = {The present study was undertaken to study the acoustic characteristics of vowels using spectrographic analysis in the Mangalorean Catholic Konkani dialect of Konkani spoken in Mangalore, Karnataka, India. Recordings of CVC words were made from 11 males and 19 females aged 18-55 years. The CVC words consisted of combinations of the vowels /i, i:, e, ɵ, ə, u, o, ɐ, ӓ, ɔ/ and the consonants /m, k, w, s, ʅ, h, l, r, p, ʤ, g, n, Ɵ, ṭ, ḷ, b, dh/. Recordings were made in a sound-treated room, and spectrographic analysis was carried out using Praat software to measure spectral and temporal characteristics such as fundamental frequency (F0), formants (F1, F2, F3), and vowel duration. The results showed that higher fundamental frequency values were observed for short, high, and back vowels. Higher F1 values were noted for open vowels, and F2 was higher for front vowels. Long vowels had longer durations compared to short vowels, and females had longer vowel durations compared to males. The acoustic information in terms of spectral and temporal cues helps in better understanding the production and perception of languages and dialects.}, }
@article {pmid36742539, year = {2022}, author = {Prakash, P and Boominathan, P and Mahalingam, S}, title = {Acoustic Description of Bhramari Pranayama.}, journal = {Indian journal of otolaryngology and head and neck surgery : official publication of the Association of Otolaryngologists of India}, volume = {74}, number = {Suppl 3}, pages = {4738-4747}, pmid = {36742539}, issn = {2231-3796}, abstract = {UNLABELLED: The study's aims were (1) to describe the acoustic characteristics of Bhramari pranayama and (2) to compare the acoustic features of the nasal consonant /m/ and the sound of Bhramari pranayama produced by yoga trainers. Cross-sectional study design. Thirty-three adult male yoga trainers performed five repeats of the nasal consonant /m/ and Bhramari pranayama. These samples were recorded into Computerized Speech Lab, Kay Pentax model 4500b, using a microphone (SM48). Formant frequencies (fF1, fF2, fF3, & fF4), formant bandwidths (BF1, BF2, BF3, & BF4), anti-formant, and alpha and beta ratios were analyzed. The nasal consonant /m/ had a higher fF2 and anti-formant compared to Bhramari pranayama. Statistically significant differences were noted in fF2, BF3, and anti-formants. Bhramari pranayama revealed a lower alpha ratio and a higher beta ratio than /m/. However, these differences were not statistically significant. Findings are discussed from acoustic and physiological perspectives. Bhramari pranayama was assumed to be produced with a larger pharyngeal cavity and a narrower velar passage when compared to the nasal consonant /m/. Verification at the level of the glottis and with aerodynamic parameters may ascertain the above propositions.
SUPPLEMENTARY INFORMATION: The online version contains supplementary material available at 10.1007/s12070-021-03054-1.}, }
@article {pmid36732236, year = {2023}, author = {Kondaurova, MV and Zheng, Q and Donaldson, CW and Smith, AF}, title = {Effect of telepractice on pediatric cochlear implant users and provider vowel space: A preliminary report.}, journal = {The Journal of the Acoustical Society of America}, volume = {153}, number = {1}, pages = {467}, doi = {10.1121/10.0016866}, pmid = {36732236}, issn = {1520-8524}, mesh = {Child ; Humans ; *Cochlear Implants ; Speech Acoustics ; Speech Production Measurement ; *Cochlear Implantation ; *Deafness/rehabilitation ; Phonetics ; *Speech Perception ; }, abstract = {Clear speaking styles are goal-oriented modifications in which talkers adapt acoustic-phonetic characteristics of speech to compensate for communication challenges. Do children with hearing loss and a clinical provider modify speech characteristics during telepractice to adjust for remote communication? The study examined the effect of telepractice (tele-) on vowel production in seven (mean age 4:11 years, SD 1:2 years) children with cochlear implants (CIs) and a provider. The first (F1) and second (F2) formant frequencies of /i/, /ɑ/, and /u/ vowels were measured in child and provider speech during one in-person and one tele-speech-language intervention, order counterbalanced. Child and provider vowel space areas (VSA) were calculated. The results demonstrated an increase in F2 formant frequency for /i/ vowel in child and provider speech and an increase in F1 formant frequency for /ɑ/ vowel in the provider speech during tele- compared to in-person intervention. An expansion of VSA was found in child and provider speech in tele- compared to in-person intervention. In children, the earlier age of CI activation was associated with larger VSA in both tele- and in-person intervention. The results suggest that the children and the provider adjust vowel articulation in response to remote communication during telepractice.}, }
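The vowel space area referred to above is, for the three corner vowels /i/, /ɑ/, /u/, the area of the triangle they span in (F1, F2) space, which the shoelace formula gives directly. A minimal sketch with illustrative (not study) formant values:

def vowel_space_area(i, a, u):
    # Each argument is an (F1, F2) pair in Hz; returns area in Hz^2.
    (x1, y1), (x2, y2), (x3, y3) = i, a, u
    return abs(x1 * (y2 - y3) + x2 * (y3 - y1) + x3 * (y1 - y2)) / 2

print(vowel_space_area(i=(300, 2700), a=(850, 1200), u=(350, 800)))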
@article {pmid36719795, year = {2022}, author = {Kirby, J and Pittayaporn, P and Brunelle, M}, title = {Transphonologization of onset voicing: revisiting Northern and Eastern Kmhmu'.}, journal = {Phonetica}, volume = {79}, number = {6}, pages = {591-629}, pmid = {36719795}, issn = {1423-0321}, mesh = {Humans ; *Voice ; Phonation ; Language ; Speech Acoustics ; Acoustics ; Phonetics ; }, abstract = {Phonation and vowel quality are often thought to play a vital role at the initial stage of tonogenesis. This paper investigates the production of voicing and tones in a tonal Northern Kmhmu' dialect spoken in Nan Province, Thailand, and a non-tonal Eastern Kmhmu' dialect spoken in Vientiane, Laos, from both acoustic and electroglottographic perspectives. Large and consistent VOT differences between voiced and voiceless stops are preserved in Eastern Kmhmu', but are not found in Northern Kmhmu', consistent with previous reports. With respect to pitch, f0 is clearly a secondary property of the voicing contrast in Eastern Kmhmu', but unquestionably the primary contrastive property in Northern Kmhmu'. Crucially, no evidence is found to suggest that either phonation type or formant differences act as significant cues to voicing in Eastern Kmhmu' or tones in Northern Kmhmu'. These results suggest that voicing contrasts can also be transphonologized directly into f0-based contrasts, skipping a registral stage based primarily on phonation and/or vowel quality.}, }
@article {pmid36714887, year = {2023}, author = {Viegas, F and Camargo, Z and Viegas, D and Guimarães, GS and Luiz, RR and Ritto, F and Simões-Zenari, M and Nemr, K}, title = {Acoustic Measurements of Speech and Voice in Men with Angle Class II, Division 1, Malocclusion.}, journal = {International archives of otorhinolaryngology}, volume = {27}, number = {1}, pages = {e10-e15}, pmid = {36714887}, issn = {1809-9777}, abstract = {Introduction The acoustic analysis of speech (measurements of the fundamental frequency and formant frequencies) of different vowels produced by speakers with the Angle class II, division 1, malocclusion can provide information about the relationship between articulatory and phonatory mechanisms in this type of maxillomandibular disproportion. Objectives To investigate acoustic measurements related to the fundamental frequency (F0) and formant frequencies (F1 and F2) of the oral vowels of Brazilian Portuguese (BP) produced by male speakers with Angle class II, division 1, malocclusion (study group) and compare with men with Angle class I malocclusion (control group). Methods In total, 60 men (20 with class II, 40 with class I) aged between 18 and 40 years were included in the study. Measurements of F0, F1 and F2 of the seven oral vowels of BP were estimated from the audio samples containing repetitions of carrier sentences. The statistical analysis was performed using the Student t-test, and the effect size was calculated. Results Significant differences (p-values) were detected for F0 values in five vowels ([e], [i], [ᴐ], [o] and [u]) and for F1 in the vowels [a] and [ᴐ], with higher values for class II, division 1. Conclusion Statistical differences were found in the F0 measurements with higher values in five of the seven vowels analysed in subjects with Angle class II, division 1. The formant frequencies showed differences only in F1 in two vowels with higher values in the study group. The data suggest that voice and speech production data must be included in the assessment protocol for patients with malocclusion.}, }
@article {pmid36712820, year = {2023}, author = {Freeman, V}, title = {Production and perception of prevelar merger: Two-dimensional comparisons using Pillai scores and confusion matrices.}, journal = {Journal of phonetics}, volume = {97}, number = {}, pages = {}, pmid = {36712820}, issn = {0095-4470}, support = {R01 DC006014/DC/NIDCD NIH HHS/United States ; }, abstract = {Vowel merger production is quantified with gradient acoustic measures, while phonemic perception methods are often coarser, complicating comparisons within mergers in progress. This study implements a perception experiment in two-dimensional formant space (F1 × F2), allowing unified plotting, quantification, and statistics with production data. Production and perception are compared within 20 speakers for a two-part prevelar merger in progress in Pacific Northwest English, where mid-front /ɛ, e/ approximate or merge before voiced velar /ɡ/ (leg-vague merger), and low-front prevelar /æɡ/ raises toward them (bag-raising). Distributions are visualized with kernel density plots and overlap quantified with Pillai scores and confusion matrices from linear discriminant analysis models. Results suggest that leg-vague merger is perceived as more complete than it is produced (in both the sample and community), while bag-raising is highly variable in production but rejected in perception. Relationships between production and perception varied by age, with raising and merger progressing across two generations in production but not perception, followed by younger adults perceiving leg-vague merger but not producing it and varying in (minimal) raising perception while varying in bag-raising in production. Thus, prevelar raising/merger may be progressing among some social groups but reversing in others.}, }
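The Pillai score used here quantifies how well a category label separates two vowel clouds in F1 × F2 space; it equals Pillai's trace, tr[H(H + E)^(-1)], where H and E are the between- and within-class sum-of-squares-and-cross-products matrices. A minimal numpy sketch (not the author's code); values near 0 indicate merger, values near 1 clear separation:

import numpy as np

def pillai(X, labels):
    # X: n x 2 array of (F1, F2) tokens; labels: length-n category array.
    grand = X.mean(axis=0)
    H = np.zeros((2, 2))  # between-class SSCP
    E = np.zeros((2, 2))  # within-class SSCP
    for g in np.unique(labels):
        Xg = X[labels == g]
        d = (Xg.mean(axis=0) - grand)[:, None]
        H += len(Xg) * (d @ d.T)
        E += (Xg - Xg.mean(axis=0)).T @ (Xg - Xg.mean(axis=0))
    return np.trace(H @ np.linalg.inv(H + E))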
@article {pmid36701896, year = {2023}, author = {Holmes, E and Johnsrude, IS}, title = {Intelligibility benefit for familiar voices is not accompanied by better discrimination of fundamental frequency or vocal tract length.}, journal = {Hearing research}, volume = {429}, number = {}, pages = {108704}, doi = {10.1016/j.heares.2023.108704}, pmid = {36701896}, issn = {1878-5891}, support = {MOP 133450//CIHR/Canada ; }, mesh = {Humans ; *Voice ; Speech ; Cognition ; *Speech Perception ; Heart Rate ; }, abstract = {Speech is more intelligible when it is spoken by familiar than unfamiliar people. If this benefit arises because key voice characteristics like perceptual correlates of fundamental frequency or vocal tract length (VTL) are more accurately represented for familiar voices, listeners may be able to discriminate smaller manipulations to such characteristics for familiar than unfamiliar voices. We measured participants' (N = 17) thresholds for discriminating pitch (correlate of fundamental frequency, or glottal pulse rate) and formant spacing (correlate of VTL; 'VTL-timbre') for voices that were familiar (participants' friends) and unfamiliar (other participants' friends). As expected, familiar voices were more intelligible. However, discrimination thresholds were no smaller for the same familiar voices. The size of the intelligibility benefit for a familiar over an unfamiliar voice did not relate to the difference in discrimination thresholds for the same voices. Also, the familiar-voice intelligibility benefit was just as large following perceptible manipulations to pitch and VTL-timbre. These results are more consistent with cognitive accounts of speech perception than traditional accounts that predict better discrimination.}, }
@article {pmid36689265, year = {2023}, author = {Ettore, E and Müller, P and Hinze, J and Riemenschneider, M and Benoit, M and Giordana, B and Hurlemann, R and Postin, D and Lecomte, A and Musiol, M and Lindsay, H and Robert, P and König, A}, title = {Digital Phenotyping for Differential Diagnosis of Major Depressive Episode: Narrative Review.}, journal = {JMIR mental health}, volume = {10}, number = {}, pages = {e37225}, pmid = {36689265}, issn = {2368-7959}, abstract = {BACKGROUND: Major depressive episode (MDE) is a common clinical syndrome. It can be found in different pathologies such as major depressive disorder (MDD), bipolar disorder (BD), posttraumatic stress disorder (PTSD), or even occur in the context of psychological trauma. However, only 1 syndrome is described in international classifications (Diagnostic and Statistical Manual of Mental Disorders, Fifth Edition [DSM-5]/International Classification of Diseases 11th Revision [ICD-11]), which do not take into account the underlying pathology at the origin of the MDE. Clinical interviews are currently the best source of information for obtaining the etiological diagnosis of MDE. Nevertheless, they do not allow an early diagnosis, and there are no objective measures of the extracted clinical information. To remedy this, digital tools correlated with clinical symptomatology could be useful.
OBJECTIVE: We aimed to review the current application of digital tools for MDE diagnosis while highlighting shortcomings for further research. In addition, our work focused on digital devices that are easy to use during the clinical interview and on mental health issues in which depression is common.
METHODS: We conducted a narrative review of the use of digital tools during clinical interviews for MDE by searching papers published in PubMed/MEDLINE, Web of Science, and Google Scholar databases since February 2010. The search was conducted from June to September 2021. Potentially relevant papers were then compared against a checklist for relevance and reviewed independently for inclusion, with focus on 4 allocated topics: (1) automated voice analysis, (2) behavior analysis by video, and physiological measures via (3) heart rate variability (HRV) and (4) electrodermal activity (EDA). For this purpose, we were interested in 4 frequently found clinical conditions in which MDE can occur: (1) MDD, (2) BD, (3) PTSD, and (4) psychological trauma.
RESULTS: A total of 74 relevant papers on the subject were qualitatively analyzed and the information was synthesized. Thus, a digital phenotype of MDE seems to emerge consisting of modifications in speech features (namely, temporal, prosodic, spectral, source, and formant features) and in speech content, modifications in nonverbal behavior (head, hand, body and eyes movement, facial expressivity, and gaze), and a decrease in physiological measurements (HRV and EDA). We not only found similarities but also differences when MDE occurs in MDD, BD, PTSD, or psychological trauma. However, comparative studies were rare in BD or PTSD conditions, which does not allow us to identify clear and distinct digital phenotypes.
CONCLUSIONS: Our search identified markers from several modalities that hold promise for helping with a more objective diagnosis of MDE. To validate their potential, further longitudinal and prospective studies are needed.}, }
@article {pmid36680472, year = {2023}, author = {Aoyama, K and Hong, L and Flege, JE and Akahane-Yamada, R and Yamada, T}, title = {Relationships Between Acoustic Characteristics and Intelligibility Scores: A Reanalysis of Japanese Speakers' Productions of American English Liquids.}, journal = {Language and speech}, volume = {}, number = {}, pages = {238309221140910}, doi = {10.1177/00238309221140910}, pmid = {36680472}, issn = {1756-6053}, abstract = {The primary purpose of this research report was to investigate the relationships between acoustic characteristics and perceived intelligibility for native Japanese speakers' productions of American English liquids. This report was based on a reanalysis of intelligibility scores and acoustic analyses that were reported in two previous studies. We examined which acoustic parameters were associated with higher perceived intelligibility scores for their productions of /l/ and /ɹ/ in American English, and whether Japanese speakers' productions of the two liquids were acoustically differentiated from each other. Results demonstrated that the second formant (F2) was strongly correlated with the perceived intelligibility scores for the Japanese adults' productions. Results also demonstrated that the Japanese adults' and children's productions of /l/ and /ɹ/ were indeed differentiated by some acoustic parameters including the third formant (F3). In addition, some changes occurred in the Japanese children's productions over the course of 1 year. Overall, the present report shows that Japanese speakers of American English may be making a distinction between /l/ and /ɹ/ in production, although the distinctions are made in a different way compared with native English speakers' productions. These findings have implications for setting realistic goals for improving intelligibility of English /l/ and /ɹ/ for Japanese speakers, as well as theoretical advancement of second-language speech learning.}, }
@article {pmid36608104, year = {2023}, author = {Sahin, S and Sen Yilmaz, B}, title = {Effects of the Orthognathic Surgery on the Voice Characteristics of Skeletal Class III Patients.}, journal = {The Journal of craniofacial surgery}, volume = {34}, number = {1}, pages = {253-257}, doi = {10.1097/SCS.0000000000008843}, pmid = {36608104}, issn = {1536-3732}, mesh = {Adult ; Humans ; Male ; Female ; Voice Quality ; Speech Acoustics ; *Orthognathic Surgery ; *Voice ; Acoustics ; }, abstract = {OBJECTIVES: To analyze the effects of bimaxillary orthognathic surgery on the voice characteristics of skeletal Class III cases, and to evaluate correlations between acoustic and skeletal changes.
METHOD: Skeletal Class III adult patients (7 male, 18 female) were asked to pronounce the sounds "[a], [ɛ], [ɯ], [i], [ɔ], [œ], [u], [y]" for 3 seconds. Voice records and lateral cephalometric x-rays were taken before the surgery (T0) and 6 months after (T1). Voice records were taken for the control group at a 6-month interval (n=20). The fundamental frequency (F0), formant frequencies (F1, F2, and F3), shimmer, jitter, and noise-to-harmonic ratio (NHR) were measured with Praat version 6.0.43.
RESULTS: In the surgery group, significant differences were observed in F1 of [ɛ], F2 and shimmer of [ɯ], F1 and F2 of [œ], and F1 of [y]; the post-surgery values were lower. F3 of the [u] sound was higher. In comparison with the control group, ΔF3 of [ɔ], ΔF3 of [u], ΔF1 of [y], Δshimmer of [ɛ], [ɯ], [i], [ɔ], [u], and [y], and ΔNHR of [ɔ] changed significantly. Pearson correlation analysis revealed correlations between ΔF2 and ΔSNA for the [ɯ] and [œ] sounds, and between ΔF1 and ΔHBV for the [y] sound.
CONCLUSION: Bimaxillary orthognathic surgery changed some voice parameters in skeletal Class III patients. Some correlations were found between skeletal and acoustic parameters. We advise clinicians to consider these findings and inform their patients.}, }
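Perturbation measures of the kind reported above (jitter, shimmer, harmonicity) can be scripted through the Praat commands exposed by the parselmouth package. This sketch uses Praat's standard parameter defaults and a placeholder file name; note that Praat reports a harmonics-to-noise ratio (HNR), the mirror image of the NHR quoted above.

import parselmouth
from parselmouth.praat import call

snd = parselmouth.Sound("sustained_vowel.wav")  # placeholder file
pp = call(snd, "To PointProcess (periodic, cc)", 75, 500)
jitter_local = call(pp, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
shimmer_local = call([snd, pp], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
harmonicity = call(snd, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
hnr_db = call(harmonicity, "Get mean", 0, 0)
print(jitter_local, shimmer_local, hnr_db)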
@article {pmid36593767, year = {2023}, author = {Kim, S and Choi, J and Cho, T}, title = {Data on English coda voicing contrast under different prosodic conditions produced by American English speakers and Korean learners of English.}, journal = {Data in brief}, volume = {46}, number = {}, pages = {108816}, pmid = {36593767}, issn = {2352-3409}, abstract = {This data article provides acoustic data for individual speakers' production of coda voicing contrast between stops in English, which are based on laboratory speech recorded by twelve native speakers of American English and twenty-four Korean learners of English. There were four pairs of English monosyllabic target words with voicing contrast in the coda position (bet-bed, pet-ped, bat-bad, pat-pad). The words were produced in carrier sentences in which they were placed in two different prosodic boundary conditions (Intonational Phrase initial and Intonational Phrase medial), two pitch accent conditions (nuclear-pitch accented and unaccented), and three focus conditions (lexical focus, phonological focus and no focus). The raw acoustic measurement values that are included in a CSV-formatted file are F0, F1, F2 and duration of each vowel preceding a coda consonant; and Voice Onset Time of word-initial stops. This article also provides figures that exemplify individual speaker variation of vowel duration, F0, F1 and F2 as a function of focus conditions. The data can thus be potentially reused to observe individual variations in phonetic encoding of coda voicing contrast as a function of the aforementioned prosodically-conditioned factors (i.e., prosodic boundary, pitch accent, focus) in native vs. non-native English. Some theoretical aspects of the data are discussed in the full-length article entitled "Phonetic encoding of coda voicing contrast under different focus conditions in L1 vs. L2 English" [1].}, }
@article {pmid36586864, year = {2022}, author = {Herbst, CT and Story, BH}, title = {Computer simulation of vocal tract resonance tuning strategies with respect to fundamental frequency and voice source spectral slope in singing.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {6}, pages = {3548}, doi = {10.1121/10.0014421}, pmid = {36586864}, issn = {1520-8524}, mesh = {Male ; Female ; Humans ; *Singing ; Computer Simulation ; *Voice ; Sound ; Vibration ; }, abstract = {A well-known concept of singing voice pedagogy is "formant tuning," where the lowest two vocal tract resonances (fR1, fR2) are systematically tuned to harmonics of the laryngeal voice source to maximize the level of radiated sound. A comprehensive evaluation of this resonance tuning concept is still needed. Here, the effect of fR1, fR2 variation was systematically evaluated in silico across the entire fundamental frequency range of classical singing for three voice source characteristics with spectral slopes of -6, -12, and -18 dB/octave. Respective vocal tract transfer functions were generated with a previously introduced low-dimensional computational model, and resultant radiated sound levels were expressed in dB(A). Two distinct strategies for optimized sound output emerged for low vs high voices. At low pitches, spectral slope was the predominant factor for sound level increase, and resonance tuning only had a marginal effect. In contrast, resonance tuning strategies became more prevalent and voice source strength played an increasingly marginal role as fundamental frequency increased to the upper limits of the soprano range. This suggests that different voice classes (e.g., low male vs high female) likely have fundamentally different strategies for optimizing sound output, which has fundamental implications for pedagogical practice.}, }
@article {pmid36578688, year = {2022}, author = {Ji, Y and Hu, Y and Jiang, X}, title = {Segmental and suprasegmental encoding of speaker confidence in Wuxi dialect vowels.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {1028106}, pmid = {36578688}, issn = {1664-1078}, abstract = {INTRODUCTION: Wuxi dialect is a variety of Wu dialect spoken in eastern China and is characterized by a rich tonal system. Compared with standard Mandarin speakers, speakers with Wuxi dialect as their mother tongue can be more efficient in varying vocal cues to encode communicative meanings in speech communication. While the literature has demonstrated that speakers encode high vs. low confidence in global prosodic cues at the sentence level, it is unknown how speakers' intended confidence is encoded at a more local, phonetic level. This study aimed to explore the effects of speakers' intended confidence on both prosodic and formant features of vowels in two lexical tones (the flat tone and the contour tone) of Wuxi dialect.
METHODS: Words of a single vowel were spoken in a confident, unconfident, or neutral tone of voice by native Wuxi dialect speakers using a standard elicitation procedure. Linear mixed-effects modeling and parametric bootstrap testing were performed.
RESULTS: The results showed that (1) the speakers raised both F1 and F2 in the confident level (compared with the neutral-intending expression), and F1 additionally distinguished the confident from the unconfident expressions; (2) compared with the neutral-intending expression, the speakers raised mean f0, showed greater f0 variation, and prolonged pronunciation time in the unconfident level, while they raised mean intensity, showed greater intensity variation, and prolonged pronunciation time in the confident level; (3) the speakers modulated mean f0 and mean intensity to a larger extent on the flat tone than on the contour tone to differentiate between levels of confidence in the voice, while they modulated f0 and intensity range more only on the contour tone.
DISCUSSION: These findings shed new light on the mechanisms of segmental and suprasegmental encoding of speaker confidence and lack of confidence at the vowel level, highlighting the interplay of lexical tone and vocal expression in speech communication.}, }
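A linear mixed-effects analysis of the kind described above can be sketched with statsmodels: a formant (or f0) measure modeled as a function of intended confidence, with by-speaker random intercepts. All column names and values below are synthetic placeholders; the study's actual models and parametric bootstrap tests are not reproduced here.

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Tiny synthetic dataset standing in for the real measurements.
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "speaker": np.repeat([f"s{i}" for i in range(10)], 30),
    "confidence": np.tile(["confident", "neutral", "unconfident"], 100),
})
df["f1"] = 600 + 30 * (df["confidence"] == "confident") + rng.normal(0, 25, len(df))

model = smf.mixedlm("f1 ~ C(confidence, Treatment(reference='neutral'))",
                    df, groups=df["speaker"])
print(model.fit().summary())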
@article {pmid36571115, year = {2023}, author = {Grawunder, S and Uomini, N and Samuni, L and Bortolato, T and Girard-Buttoz, C and Wittig, RM and Crockford, C}, title = {Expression of concern: 'Chimpanzee vowel-like sounds and voice quality suggest formant space expansion through the hominoid lineage' (2022) by Grawunder et al.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {378}, number = {1870}, pages = {20220476}, doi = {10.1098/rstb.2022.0476}, pmid = {36571115}, issn = {1471-2970}, }
@article {pmid36508721, year = {2022}, author = {Moya-Galé, G and Wisler, AA and Walsh, SJ and McAuliffe, MJ and Levy, ES}, title = {Acoustic Predictors of Ease of Understanding in Spanish Speakers With Dysarthria Associated With Parkinson's Disease.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {}, number = {}, pages = {1-14}, doi = {10.1044/2022_JSLHR-22-00284}, pmid = {36508721}, issn = {1558-9102}, abstract = {PURPOSE: The purpose of this study was to examine selected baseline acoustic features of hypokinetic dysarthria in Spanish speakers with Parkinson's disease (PD) and identify potential acoustic predictors of ease of understanding in Spanish.
METHOD: Seventeen Spanish-speaking individuals with mild-to-moderate hypokinetic dysarthria secondary to PD and eight healthy controls were recorded reading a translation of the Rainbow Passage. Acoustic measures of vowel space area, as indicated by the formant centralization ratio (FCR), envelope modulation spectra (EMS), and articulation rate were derived from the speech samples. Additionally, 15 healthy adults rated ease of understanding of the recordings on a visual analogue scale. A multiple linear regression model was implemented to investigate the predictive value of the selected acoustic parameters on ease of understanding.
RESULTS: Listeners' ease of understanding was significantly lower for speakers with dysarthria than for healthy controls. The FCR, EMS from the first 10 s of the reading passage, and the difference in EMS between the end and the beginning sections of the passage differed significantly between the two groups of speakers. Findings indicated that 67.7% of the variability in ease of understanding was explained by the predictive model, suggesting a moderately strong relationship between the acoustic and perceptual domains.
CONCLUSIONS: Measures of envelope modulation spectra were found to be highly significant model predictors of ease of understanding of Spanish-speaking individuals with hypokinetic dysarthria associated with PD. Articulation rate was also found to be important (albeit to a lesser degree) in the predictive model. The formant centralization ratio should be further examined with a larger sample size and more severe dysarthria to determine its efficacy in predicting ease of understanding.}, }
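The formant centralization ratio mentioned above has a closed form (Sapir et al., 2010): FCR = (F2u + F2ɑ + F1i + F1u) / (F2i + F1ɑ), computed from mean corner-vowel formants, with higher values indicating more centralized vowels. A one-function sketch with illustrative values:

def fcr(f1_i, f2_i, f1_a, f2_a, f1_u, f2_u):
    # Formant centralization ratio from mean formants (Hz) of /i/, /a/, /u/.
    return (f2_u + f2_a + f1_i + f1_u) / (f2_i + f1_a)

print(fcr(f1_i=300, f2_i=2300, f1_a=800, f2_a=1300, f1_u=350, f2_u=900))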
@article {pmid36477984, year = {2022}, author = {Peng, H and Li, S and Xing, J and Yang, F and Wu, A}, title = {Surface plasmon resonance of Au/Ag metals for the photoluminescence enhancement of lanthanide ion Ln[3+] doped upconversion nanoparticles in bioimaging.}, journal = {Journal of materials chemistry. B}, volume = {}, number = {}, pages = {}, doi = {10.1039/d2tb02251f}, pmid = {36477984}, issn = {2050-7518}, abstract = {Deep tissue penetration, chemical inertness and biocompatibility give UCNPs a competitive edge over traditional fluorescent materials like organic dyes or quantum dots. However, the low quantum efficiency of UCNPs becomes an obstacle. Among the many methods and strategies currently used to solve this issue, surface plasmon resonance (SPR) of noble metals is of great use due to the agreement between the SPR peak of metals and the absorption band of UCNPs. A key challenge of this match is that the structures and sizes of noble metals have significant influences on the peak of SPR formants, so an explicit elucidation of the relationships between the physical properties of noble metals and their SPR formants is of great importance. This review aims to clarify the mechanism of the SPR effect of noble metals on the optical performance of UCNPs. Furthermore, novel research studies in which Au, Ag or Au/Ag composites in various structures and sizes are combined with UCNPs through different synthetic methods are summarized. We provide an overview of improved photoluminescence for bioimaging exhibited by different composite nanoparticles with respect to UCNPs acting as both cores and shells, taking Au@UCNPs, Ag@UCNPs and Au/Ag@UCNPs into account. Finally, there are remaining shortcomings and latent opportunities that deserve further research. This review will provide directions for the bioimaging applications of UCNPs through the introduction of the SPR effect of noble metals.}, }
@article {pmid36460491, year = {2022}, author = {Wang, Y and Hattori, M and Liu, R and Sumita, YI}, title = {Digital acoustic analysis of the first three formant frequencies in patients with a prosthesis after maxillectomy.}, journal = {The Journal of prosthetic dentistry}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.prosdent.2022.10.010}, pmid = {36460491}, issn = {1097-6841}, abstract = {STATEMENT OF PROBLEM: Prosthetic rehabilitation with an obturator can help to restore or improve the intelligibility of speech in patients after maxillectomy. The frequency of formants 1 and 2 as well as their ranges were initially reported in patients with maxillary defects in 2002, and the evaluation method that was used is now applied in clinical evaluation. However, the details of formant 3 are not known and warrant investigation because, according to speech science, formant 3 is related to the pharyngeal volume. Clarifying the formant frequency values of formant 3 in patients after maxillectomy would enable prosthodontists to refer to these data when planning treatment and when assessing the outcome of an obturator.
PURPOSE: The purpose of this clinical study was to determine the acoustic characteristics of formant 3, together with those of formants 1 and 2, by using a digital acoustic analysis during maxillofacial prosthetic treatment. The utility of determining formant 3 in the evaluation of speech in patients after maxillectomy was also assessed.
MATERIAL AND METHODS: Twenty-six male participants after a maxillectomy (mean age, 63 years; range, 20 to 93 years) were included, and the 5 Japanese vowels /a/, /e/, /i/, /o/, and /u/ produced with and without a definitive obturator prosthesis were recorded. The frequencies of the 3 formants were determined, and their ranges were calculated by using a speech analysis system (Computerized Speech Lab CSL 4400). The Wilcoxon signed rank test was used to compare the formants between the 2 use conditions (α=0.05).
RESULTS: Significant differences were found in the frequencies and ranges of all 3 formants between the use conditions. The ranges of all 3 formants produced with the prosthesis were significantly greater than those produced without it.
CONCLUSIONS: Based on the findings, both the first 2 formants and the third formant were changed by wearing an obturator prosthesis. Because formant 3 is related to the volume of the pharynx, evaluation of this formant and its range can reflect the effectiveness of the prosthesis to seal the oronasal communication and help reduce hypernasality, suggesting the utility of formant 3 analysis in prosthodontic rehabilitation.}, }
@article {pmid36456282, year = {2022}, author = {Voeten, CC and Heeringa, W and Van de Velde, H}, title = {Normalization of nonlinearly time-dynamic vowels.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {5}, pages = {2692}, doi = {10.1121/10.0015025}, pmid = {36456282}, issn = {1520-8524}, abstract = {This study compares 16 vowel-normalization methods for purposes of sociophonetic research. Most of the previous work in this domain has focused on the performance of normalization methods on steady-state vowels. By contrast, this study explicitly considers dynamic formant trajectories, using generalized additive models to model these nonlinearly. Normalization methods were compared using a hand-corrected dataset from the Flemish-Dutch Teacher Corpus, which contains 160 speakers from 8 geographical regions, who spoke regionally accented versions of Netherlandic/Flemish Standard Dutch. Normalization performance was assessed by comparing the methods' abilities to remove anatomical variation, retain vowel distinctions, and explain variation in the normalized F0-F3. In addition, it was established whether normalization competes with by-speaker random effects or supplements it, by comparing how much between-speaker variance remained to be apportioned to random effects after normalization. The results partly reproduce the good performance of Lobanov, Gerstman, and Nearey 1 found earlier and generally favor log-mean and centroid methods. However, newer methods achieve higher effect sizes (i.e., explain more variance) at only marginally worse performances. Random effects were found to be equally useful before and after normalization, showing that they complement it. The findings are interpreted in light of the way that the different methods handle formant dynamics.}, }
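Among the methods compared above, Lobanov normalization is simply a per-speaker z-score of each formant, which removes anatomical variation while retaining vowel distinctions. A minimal sketch with illustrative column names:

import pandas as pd

def lobanov(df, formants=("F1", "F2")):
    out = df.copy()
    for f in formants:
        by_speaker = out.groupby("speaker")[f]
        out[f + "_norm"] = (out[f] - by_speaker.transform("mean")) / by_speaker.transform("std")
    return out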
@article {pmid36455242, year = {2023}, author = {Leyns, C and Daelman, J and Adriaansen, A and Tomassen, P and Morsomme, D and T'Sjoen, G and D'haeseleer, E}, title = {Short-Term Acoustic Effects of Speech Therapy in Transgender Women: A Randomized Controlled Trial.}, journal = {American journal of speech-language pathology}, volume = {32}, number = {1}, pages = {145-168}, doi = {10.1044/2022_AJSLP-22-00135}, pmid = {36455242}, issn = {1558-9110}, mesh = {Humans ; Female ; *Speech Therapy ; Speech Acoustics ; *Transgender Persons ; Acoustics ; Speech ; }, abstract = {PURPOSE: This study measured and compared the acoustic short-term effects of pitch elevation training (PET) and articulation-resonance training (ART), and the combination of both programs, in transgender women.
METHOD: A randomized controlled study with cross-over design was used. Thirty transgender women were included and received 14 weeks of speech training. All participants started with 4 weeks of sham training, after which they were randomly assigned to one of two groups: One group continued with PET (5 weeks), followed by ART (5 weeks); the second group received both trainings in opposite order. Participants were recorded 4 times, in between the training blocks: pre, post 1 (after sham), post 2 (after training 1), and post 3 (after training 2). Speech samples included a sustained vowel, continuous speech during reading, and spontaneous speech and were analyzed using Praat software. Fundamental frequency (fo), intensity, voice range profile, vowel formant frequencies (F1-2-3-4-5 of /a/-/i/-/u/), formant contrasts, vowel space, and vocal quality (Acoustic Voice Quality Index) were determined.
RESULTS AND CONCLUSIONS: Fundamental frequencies increased after both the PET and ART program, with a higher increase after PET. The combination of both interventions showed a mean increase of the fo of 49 Hz during a sustained vowel, 49 Hz during reading, and 29 Hz during spontaneous speech. However, the lower limit (percentile 5) of the fo during spontaneous speech did not change. Higher values were detected for F1-2 of /a/, F3 of /u/, and vowel space after PET and ART separately. F1-2-3 of /a/, F1-3-4 of /u/, vowel space, and formant contrasts increased after the combination of PET and ART; hence, the combination induced more increases in formant frequencies. Intensity and voice quality measurements did not change. No order effect was detected; that is, starting with PET or ART did not change the outcome.}, }
@article {pmid36425833, year = {2022}, author = {Chen, S and Han, C and Wang, S and Liu, X and Wang, B and Wei, R and Lei, X}, title = {Hearing the physical condition: The relationship between sexually dimorphic vocal traits and underlying physiology.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {983688}, pmid = {36425833}, issn = {1664-1078}, abstract = {A growing amount of research has shown associations between sexually dimorphic vocal traits and physiological conditions related to reproductive advantage. This paper presents a review of the literature on the relationship between sexually dimorphic vocal traits and sex hormones, body size, and physique. Those physiological conditions are important in reproductive success and mate selection. Regarding sex hormones, there are associations between sex-specific hormones and sexually dimorphic vocal traits; regarding body size, formant frequencies are more reliable predictors of human body size than pitch/fundamental frequency; regarding physique, there is a possible but still controversial association between human voice and strength and combat power, while pitch is more often used as a signal of aggressive intent in conflict. Future research should consider demographic, cross-cultural, cognitive interaction, and emotional motivation influences, in order to more accurately assess the relationship between voice and physiology. Moreover, neurological studies are recommended to gain a deeper understanding of the evolutionary origins and adaptive functions of voice modulation.}, }
@article {pmid36397662, year = {2022}, author = {Eichner, ACO and Donadon, C and Skarżyński, PH and Sanfins, MD}, title = {A Systematic Review of the Literature Between 2009 and 2019 to Identify and Evaluate Publications on the Effects of Age-Related Hearing Loss on Speech Processing.}, journal = {Medical science monitor : international medical journal of experimental and clinical research}, volume = {28}, number = {}, pages = {e938089}, pmid = {36397662}, issn = {1643-3750}, mesh = {Aged ; Animals ; Humans ; Speech ; *Speech Perception/physiology ; Acoustic Stimulation ; *Hearing Loss, Sensorineural ; *Cochlear Implants ; }, abstract = {Changes in central auditory processing due to aging in normal-hearing elderly patients, as well as age-related hearing loss, are often associated with difficulties in speech processing, especially in unfavorable acoustic environments. Speech processing depends on the perception of temporal and spectral features, and for this reason can be assessed by recordings of phase-locked neural activity synchronized to transient and periodic sound stimuli (frequency-following responses, FFRs). An electronic search of the PubMed and Web of Science databases was carried out in July 2019. Studies that evaluated the effects of age-related hearing loss on components of FFRs were included. Studies that were not in English, studies performed on animals, studies with cochlear implant users, literature reviews, letters to the editor, and case studies were excluded. Our search yielded 6 studies, each of which included 30 to 94 subjects aged between 18 and 80 years. Latency increases and significant amplitude reductions of the onset, offset, and slope V/A components of FFRs were observed. Latency and amplitude impairment of the fundamental frequency, first formant, and higher formants were related to peripheral sensorineural hearing loss in the elderly population. Conclusions: Temporal changes in FFR tracing were related to the aging process. Hearing loss also impacts the envelope fine structure, producing poorer speech comprehension in noisy environments. More research is needed to understand aspects related to hearing loss and cognitive aspects common to the elderly.}, }
@article {pmid36376191, year = {2022}, author = {Raveendran, R and Yeshoda, K}, title = {Effects of Resonant Voice Therapy on Perceptual and Acoustic Source and Tract Parameters - A Preliminary Study on Indian Carnatic Classical Singers.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.09.023}, pmid = {36376191}, issn = {1873-4588}, abstract = {PURPOSE: The aim of the study was to examine the effects of resonant voice therapy (RVT) on the vocal resonance of trained Carnatic singers. The specific objectives were to evaluate the effects of resonant voice therapy on the auditory perceptual judgments and acoustic source and tract parameters before and after RVT on phonation and sung voice samples.
METHOD: Six vocally healthy trained Carnatic singers, three males and three females aged 18-25 years (M = 23; SD = 2.09), participated in the study. All the participants were assigned to a 21-day resonant voice therapy (RVT) training program. The participants' pre- and post-training phonation and sung samples were subjected to auditory perceptual analysis and acoustic analysis.
RESULTS: The results revealed that the post-training auditory perceptual ratings of the phonation task showed a statistically significant difference from the pre-training scores (Z = 2.35; P = 0.019), while for the singing task, the post-training perceptual ratings were not significantly different from the pre-training scores (Z = 2.66; P = 0.08). A significant difference was observed between the pre- and post-training values for all the measured acoustic parameters of the phonation task. In the singing task, though the fundamental frequency and the third and fourth formant frequencies showed no significant difference between the pre- and post-training conditions (P > 0.05), the difference between the first formant frequency and the fundamental frequency showed a significant decrease (P = 0.028).
CONCLUSION: Resonant voice production led to a higher vocal economy, as evidenced by the improved source and filter acoustic parameters. These results point to formant tuning through vocal tract modifications, probably an enlarged pharyngeal area, resulting in increased resonant voice quality in both the phonation and singing tasks.}, }
@article {pmid36371478, year = {2022}, author = {Rocchesso, D and Andolina, S and Ilardo, G and Palumbo, SD and Galluzzo, Y and Randazzo, M}, title = {A perceptual sound space for auditory displays based on sung-vowel synthesis.}, journal = {Scientific reports}, volume = {12}, number = {1}, pages = {19370}, pmid = {36371478}, issn = {2045-2322}, mesh = {Humans ; Sound Spectrography ; *Singing ; Sound ; *Speech Perception ; }, abstract = {When designing displays for the human senses, perceptual spaces are of great importance to give intuitive access to physical attributes. Similar to how perceptual spaces based on hue, saturation, and lightness were constructed for visual color, research has explored perceptual spaces for sounds of a given timbral family based on timbre, brightness, and pitch. To promote an embodied approach to the design of auditory displays, we introduce the Vowel-Type-Pitch (VTP) space, a cylindrical sound space based on human sung vowels, whose timbres can be synthesized by the composition of acoustic formants and can be categorically labeled. Vowels are arranged along the circular dimension, while voice type and pitch of the vowel correspond to the remaining two axes of the cylindrical VTP space. The decoupling and perceptual effectiveness of the three dimensions of the VTP space are tested through a vowel labeling experiment, whose results are visualized as maps on circular slices of the VTP cylinder. We discuss implications for the design of auditory and multi-sensory displays that account for human perceptual capabilities.}, }
@article {pmid36360418, year = {2022}, author = {Yoon, TJ and Ha, S}, title = {Adults' Perception of Children's Vowel Production.}, journal = {Children (Basel, Switzerland)}, volume = {9}, number = {11}, pages = {}, pmid = {36360418}, issn = {2227-9067}, abstract = {The study examined the link between Korean-speaking children's vowel production and its perception by inexperienced adults and also observed whether ongoing vowel changes in mid-back vowels affect adults' perceptions when the vowels are produced by children. This study analyzed vowels in monosyllabic words produced by 20 children, ranging from 2 to 6 years old, with a focus on gender distinction, and used them as perceptual stimuli for word perception by 20 inexperienced adult listeners. Acoustic analyses indicated that F0 was not a reliable cue for distinguishing gender, but the first two formants served as reliable cues for gender distinction. The results confirmed that the spacing of the two low formants is linguistically and para-linguistically important in identifying vowel types and gender. However, a pair of non-low back vowels caused difficulties in correct vowel identification. Proximal distance between the vowels could be interpreted to result in the highest mismatch between children's production and adults' perception of the two non-low back vowels in the Korean language. We attribute the source of the highest mismatch of the two non-low back vowels to the ongoing sound change observed in high and mid-back vowels in adult speech. The ongoing vowel change is also observed in the children's vowel space, which may well be shaped after the caregivers whose non-low back vowels are close to each other.}, }
@article {pmid36359019, year = {2022}, author = {Guo, S and Wu, W and Liu, Y and Kang, X and Li, C}, title = {Effects of Valley Topography on Acoustic Communication in Birds: Why Do Birds Avoid Deep Valleys in Daqinggou Nature Reserve?.}, journal = {Animals : an open access journal from MDPI}, volume = {12}, number = {21}, pages = {}, pmid = {36359019}, issn = {2076-2615}, abstract = {To investigate the effects of valley topography on the acoustic transmission of avian vocalisations, we carried out playback experiments in Daqinggou valley, Inner Mongolia, China. During the experiments, we recorded the vocalisations of five avian species, the large-billed crow (Corvus macrorhynchos Wagler, 1827), common cuckoo (Cuculus canorus Linnaeus, 1758), Eurasian magpie (Pica pica Linnaeus, 1758), Eurasian tree sparrow (Passer montanus Linnaeus, 1758), and meadow bunting (Emberiza cioides Brandt, 1843), at transmission distances of 30 m and 50 m in the upper and lower parts of the valley and analysed the intensity, the fundamental frequency (F0), and the first three formant frequencies (F1/F2/F3) of the sounds. We also investigated bird species diversity in the upper and lower valley. We found that: (1) at the distance of 30 m, there were significant differences in F0/F1/F2/F3 in Eurasian magpies, significant differences in F1/F2/F3 in the meadow bunting and Eurasian tree sparrow, and partially significant differences in sound frequency between the upper and lower valley in the other two species; (2) at the distance of 50 m, there were significant differences in F0/F1/F2/F3 in two avian species (large-billed crow and common cuckoo) between the upper and lower valley and partially significant differences in sound frequency between the upper and lower valley in the other three species; (3) there were significant differences in the acoustic intensities of crow, cuckoo, magpie, and bunting calls between the upper and lower valley; and (4) species number and richness were significantly higher in the upper valley than in the lower valley. We suggest that the structure of valley habitats may lead to the breakdown of acoustic signals and communication in birds to varying degrees. The effect of valley topography on acoustic communication could be one reason for animal species avoiding deep valleys.}, }
@article {pmid36351244, year = {2022}, author = {Kim, Y and Thompson, A}, title = {An Acoustic-Phonetic Approach to Effects of Face Masks on Speech Intelligibility.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {12}, pages = {4679-4689}, pmid = {36351244}, issn = {1558-9102}, support = {F31 DC020121/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Female ; Humans ; *Speech Intelligibility ; Phonetics ; Speech Acoustics ; Acoustics ; *Speech Perception ; }, abstract = {PURPOSE: This study aimed to examine the effects of wearing a face mask on speech acoustics and intelligibility, using an acoustic-phonetic analysis of speech. In addition, the effects of speakers' behavioral modification while wearing a mask were examined.
METHOD: Fourteen female adults were asked to read a set of words and sentences under three conditions: (a) conversational, mask-off; (b) conversational, mask-on; and (c) clear, mask-on. Seventy listeners rated speech intelligibility using two methods: orthographic transcription and visual analog scale (VAS). Acoustic measures for vowels included duration, first (F1) and second (F2) formant frequency, and intensity ratio of F1/F2. For consonants, spectral moment coefficients and consonant-vowel (CV) boundary (intensity ratio between consonant and vowel) were measured.
RESULTS: Face masks had a negative impact on speech intelligibility as measured by both rating methods. However, speech intelligibility was recovered in the clear speech condition for VAS but not for transcription scores. Analysis of orthographic transcription showed that listeners tended to confuse consonants (particularly fricatives, affricates, and stops) rather than vowels in the word-initial position. Acoustic data indicated a significant effect of condition on CV intensity ratio only.
CONCLUSIONS: Our data demonstrate a negative effect of face masks on speech intelligibility, mainly affecting consonants. However, intelligibility can be enhanced by speaking clearly, likely driven by prosodic alterations.}, }
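The spectral moment coefficients measured for consonants in the study above treat the power spectrum of a consonant frame as a probability distribution over frequency. A minimal sketch of the first four moments (center of gravity, standard deviation, skewness, excess kurtosis), using a random-noise frame as a stand-in for a fricative:

```python
# Sketch: first four spectral moments of a consonant noise segment,
# treating the normalized power spectrum as a distribution over frequency.
import numpy as np

def spectral_moments(x, sr):
    spec = np.abs(np.fft.rfft(x * np.hanning(len(x)))) ** 2
    freqs = np.fft.rfftfreq(len(x), d=1.0 / sr)
    p = spec / spec.sum()                       # normalize to a distribution
    m1 = (freqs * p).sum()                      # center of gravity (Hz)
    var = ((freqs - m1) ** 2 * p).sum()         # variance
    sd = np.sqrt(var)
    skew = ((freqs - m1) ** 3 * p).sum() / sd**3
    kurt = ((freqs - m1) ** 4 * p).sum() / var**2 - 3.0  # excess kurtosis
    return m1, sd, skew, kurt

sr = 16000
frame = np.random.randn(1024)                   # stand-in for an /s/ frame
print(spectral_moments(frame, sr))
```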
@article {pmid36322641, year = {2022}, author = {Baker, CP and Sundberg, J and Purdy, SC and Rakena, TO}, title = {Female adolescent singing voice characteristics: an exploratory study using LTAS and inverse filtering.}, journal = {Logopedics, phoniatrics, vocology}, volume = {}, number = {}, pages = {1-13}, doi = {10.1080/14015439.2022.2140455}, pmid = {36322641}, issn = {1651-2022}, abstract = {Background and Aim: To date, little research is available that objectively quantifies female adolescent singing-voice characteristics in light of the physiological and functional developments that occur from puberty to adulthood. This exploratory study sought to augment the pool of data available that offers objective voice analysis of female singers in late adolescence.Methods: Using long-term average spectra (LTAS) and inverse filtering techniques, dynamic range and voice-source characteristics were determined in a cohort of vocally healthy cis-gender female adolescent singers (17 to 19 years) from high-school choirs in Aotearoa New Zealand. Non-parametric statistics were used to determine associations and significant differences.Results: Wide intersubject variation was seen between dynamic range, spectral measures of harmonic organisation (formant cluster prominence, FCP), noise components in the spectrum (high-frequency energy ratio, HFER), and the normalised amplitude quotient (NAQ) suggesting great variability in ability to control phonatory mechanisms such as subglottal pressure (Psub), glottal configuration and adduction, and vocal tract shaping. A strong association between the HFER and NAQ suggest that these non-invasive measures may offer complimentary insights into vocal function, specifically with regard to glottal adduction and turbulent noise in the voice signal.Conclusion: Knowledge of the range of variation within healthy adolescent singers is necessary for the development of effective and inclusive pedagogical practices, and for vocal-health professionals working with singers of this age. LTAS and inverse filtering are useful non-invasive tools for determining such characteristics.}, }
@article {pmid36313043, year = {2022}, author = {Easwar, V and Purcell, D and Eeckhoutte, MV and Aiken, SJ}, title = {The Influence of Male- and Female-Spoken Vowel Acoustics on Envelope-Following Responses.}, journal = {Seminars in hearing}, volume = {43}, number = {3}, pages = {223-239}, pmid = {36313043}, issn = {0734-0451}, abstract = {The influence of male and female vowel characteristics on the envelope-following responses (EFRs) is not well understood. This study explored the role of vowel characteristics on the EFR at the fundamental frequency (f0) in response to the vowel /ε/ (as in "head"). Vowel tokens were spoken by five males and five females and EFRs were measured in 25 young adults (21 females). An auditory model was used to estimate changes in auditory processing that might account for talker effects on EFR amplitude. There were several differences between male and female vowels in relation to the EFR. For male talkers, EFR amplitudes were correlated with the bandwidth and harmonic count of the first formant, and the amplitude of the trough below the second formant. For female talkers, EFR amplitudes were correlated with the range of f0 frequencies and the amplitude of the trough above the second formant. The model suggested that the f0 EFR reflects a wide distribution of energy in speech, with primary contributions from high-frequency harmonics mediated from cochlear regions basal to the peaks of the first and second formants, not from low-frequency harmonics with energy near f0. Vowels produced by female talkers tend to produce lower-amplitude EFR, likely because they depend on higher-frequency harmonics where speech sound levels tend to be lower. This work advances auditory electrophysiology by showing how the EFR evoked by speech relates to the acoustics of speech, for both male and female voices.}, }
@article {pmid36304844, year = {2022}, author = {Pah, ND and Indrawati, V and Kumar, DK}, title = {Voice Features of Sustained Phoneme as COVID-19 Biomarker.}, journal = {IEEE journal of translational engineering in health and medicine}, volume = {10}, number = {}, pages = {4901309}, pmid = {36304844}, issn = {2168-2372}, mesh = {Humans ; *COVID-19 ; Cross-Sectional Studies ; Longitudinal Studies ; Pandemics ; SARS-CoV-2 ; Biomarkers ; }, abstract = {BACKGROUND: The COVID-19 pandemic has resulted in enormous costs to our society. Besides finding medicines to treat those infected by the virus, it is important to find effective and efficient strategies to prevent the spreading of the disease. One key factor to prevent transmission is to identify COVID-19 biomarkers that can be used to develop an efficient, accurate, noninvasive, and self-administered screening procedure. Several COVID-19 variants cause significant respiratory symptoms, and thus a voice signal may be a potential biomarker for COVID-19 infection.
AIM: This study investigated the effectiveness of different phonemes and a range of voice features in differentiating people infected by COVID-19 with respiratory tract symptoms.
METHOD: This cross-sectional, longitudinal study recorded six phonemes (i.e., /a/, /e/, /i/, /o/, /u/, and /m/) from 40 COVID-19 patients and 48 healthy subjects for 22 days. The signal features were obtained for the recordings, which were statistically analyzed and classified using Support Vector Machine (SVM).
RESULTS: The statistical analysis and SVM classification show that the voice features related to the vocal tract filtering (e.g., MFCC, VTL, and formants) and the stability of the respiratory muscles and lung volume (Intensity-SD) were the most sensitive to voice change due to COVID-19. The results also show that the features extracted from the vowel /i/ during the first 3 days after admission to the hospital were the most effective. The SVM classification accuracy with 18 ranked features extracted from /i/ was 93.5% (with an F1 score of 94.3%).
CONCLUSION: A measurable difference exists between the voices of people with COVID-19 and healthy people, and the phoneme /i/ shows the most pronounced difference. This supports the potential for using computerized voice analysis to detect the disease and consider it a biomarker.}, }
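The pipeline described above combines ranked voice features with a support vector machine. A hedged sketch of such a setup using scikit-learn, with a synthetic feature matrix in place of the study's recordings (the 88 x 18 shape mirrors the reported cohort size and feature count, but nothing else of the authors' pipeline is reproduced):

```python
# Sketch of an SVM classifier over per-recording voice features
# (e.g., MFCC means, formants, intensity SD). Feature matrix is synthetic.
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

rng = np.random.default_rng(0)
X = rng.normal(size=(88, 18))          # 88 recordings x 18 ranked features
y = np.r_[np.ones(40), np.zeros(48)]   # 40 patients, 48 healthy controls

clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=1.0))
scores = cross_val_score(clf, X, y, cv=5, scoring="f1")
print("cross-validated F1: %.3f +/- %.3f" % (scores.mean(), scores.std()))
```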
@article {pmid36293884, year = {2022}, author = {Choi, MK and Yoo, SD and Park, EJ}, title = {Destruction of Vowel Space Area in Patients with Dysphagia after Stroke.}, journal = {International journal of environmental research and public health}, volume = {19}, number = {20}, pages = {}, pmid = {36293884}, issn = {1660-4601}, mesh = {Humans ; Dysarthria/complications ; *Deglutition Disorders/etiology ; Speech Acoustics ; Deglutition ; *Stroke/complications ; }, abstract = {Dysphagia is associated with dysarthria in stroke patients. Vowel space decreases in stroke patients with dysarthria; destruction of the vowel space is often observed. We determined the correlation of destruction of acoustic vowel space with dysphagia in stroke patients. Seventy-four individuals with dysphagia and dysarthria who had experienced stroke were enrolled. For the vowels /a/, /ae/, /i/, and /u/, we determined the formant parameters (which represent vocal tract resonance frequencies as two-dimensional coordinate points), the formant centralization ratio (FCR), and the quadrilateral vowel space area (VSA). Swallowing function was assessed using the videofluoroscopic dysphagia scale (VDS) during videofluoroscopic swallowing studies. Pearson's correlation and linear regression were used to determine the correlation between VSA, FCR, and VDS. Subgroups were created based on VSA; vowel space destruction groups were compared using ANOVA and Scheffe's test. VSA and FCR were negatively and positively correlated with VDS, respectively. Groups were separated based on mean and standard deviation of VSA. One-way ANOVA revealed significant differences in VDS, FCR, and age between the VSA groups and no significant differences in VDS between mild and moderate VSA reduction and vowel space destruction groups. VSA and FCR values correlated with swallowing function. Vowel space destruction has characteristics similar to VSA reduction at a moderate-to-severe degree and has utility as an indicator of dysphagia severity.}, }
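The two metrics above have standard formulations in the dysarthria literature (Sapir and colleagues): the quadrilateral VSA is the shoelace area of the corner vowels in (F2, F1) space, and the FCR is a ratio that rises as vowels centralize. A sketch with illustrative formant values, not data from the study:

```python
# Sketch: quadrilateral vowel space area (shoelace formula over the corner
# vowels) and the formant centralization ratio (FCR). Values in Hz are
# illustrative placeholders.
def vsa(points):
    """points: (F2, F1) vertices of the vowel quadrilateral, in order."""
    area = 0.0
    n = len(points)
    for i in range(n):
        x1, y1 = points[i]
        x2, y2 = points[(i + 1) % n]
        area += x1 * y2 - x2 * y1
    return abs(area) / 2.0

F1 = {"i": 300, "ae": 650, "a": 750, "u": 350}
F2 = {"i": 2300, "ae": 1900, "a": 1200, "u": 800}

quad = [(F2[v], F1[v]) for v in ("i", "ae", "a", "u")]
fcr = (F2["u"] + F2["a"] + F1["i"] + F1["u"]) / (F2["i"] + F1["a"])
print("VSA = %.0f Hz^2, FCR = %.2f" % (vsa(quad), fcr))
```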
@article {pmid36289365, year = {2022}, author = {Müller, M and Wang, Z and Caffier, F and Caffier, PP}, title = {New objective timbre parameters for classification of voice type and fach in professional opera singers.}, journal = {Scientific reports}, volume = {12}, number = {1}, pages = {17921}, pmid = {36289365}, issn = {2045-2322}, mesh = {Humans ; *Singing ; Voice Quality ; *Voice ; Occupations ; Sound ; }, abstract = {Voice timbre is defined as sound color independent of pitch and volume, based on a broad frequency band between 2 and 4 kHz. Since there are no specific timbre parameters, previous studies have come to the very general conclusion that the center frequencies of the singer's formants are somewhat higher in the higher voice types than in the lower ones. For specification, a database was created containing 1723 sound examples of various voice types. The energy distribution in the frequency bands of the singer's formants was extracted for quantitative analysis. When the energy distribution function reached 50%, the corresponding absolute frequency in Hz was defined as Frequency of Half Energy (FHE). This new parameter quantifies the timbre of a singing voice as a concrete measure, independent of fundamental frequency, vowel color and volume. The database allows assigning FHE means ± SD as characteristic or comparative values for sopranos (3092 ± 284 Hz), tenors (2705 ± 221 Hz), baritones (2454 ± 206 Hz) and basses (2384 ± 164 Hz). In addition to vibrato, specific timbre parameters provide another valuable feature in vocal pedagogy for classification of voice type and fach according to the lyric or dramatic character of the voice.}, }
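The FHE defined above is the frequency at which the cumulative energy within the singer's-formant band reaches 50%. A minimal sketch, assuming a 2-4 kHz analysis band and a crude synthetic "sung vowel" built from decaying harmonics:

```python
# Sketch of the Frequency of Half Energy (FHE): the frequency at which
# cumulative energy within the singer's-formant band (here 2-4 kHz)
# reaches 50% of the band total. Band edges and signal are assumptions.
import numpy as np

def fhe(x, sr, band=(2000.0, 4000.0)):
    spec = np.abs(np.fft.rfft(x * np.hanning(len(x)))) ** 2
    freqs = np.fft.rfftfreq(len(x), d=1.0 / sr)
    in_band = (freqs >= band[0]) & (freqs <= band[1])
    energy = spec[in_band]
    cum = np.cumsum(energy) / energy.sum()
    return freqs[in_band][np.searchsorted(cum, 0.5)]

sr = 44100
t = np.arange(sr) / sr
# crude stand-in for a sung vowel: harmonics of 220 Hz, 1/k amplitudes
x = sum(np.sin(2 * np.pi * 220 * k * t) / k for k in range(1, 20))
print("FHE = %.0f Hz" % fhe(x, sr))
```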
@article {pmid36279585, year = {2022}, author = {Hussain, RO and Kumar, P and Singh, NK}, title = {Subcortical and Cortical Electrophysiological Measures in Children With Speech-in-Noise Deficits Associated With Auditory Processing Disorders.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {11}, pages = {4454-4468}, doi = {10.1044/2022_JSLHR-22-00094}, pmid = {36279585}, issn = {1558-9102}, mesh = {Child ; Humans ; Adolescent ; *Auditory Perceptual Disorders/diagnosis ; Speech ; Noise ; *Speech Perception/physiology ; Evoked Potentials, Auditory ; Acoustic Stimulation ; Evoked Potentials, Auditory, Brain Stem/physiology ; }, abstract = {PURPOSE: The aim of this study was to analyze the subcortical and cortical auditory evoked potentials for speech stimuli in children with speech-in-noise (SIN) deficits associated with auditory processing disorder (APD) without any reading or language deficits.
METHOD: The study included 20 children in the age range of 9-13 years. Ten children were recruited to the APD group; they had below-normal scores on the speech-perception-in-noise test and were diagnosed as having APD. The remaining 10 were typically developing (TD) children and were recruited to the TD group. Speech-evoked subcortical (brainstem) and cortical (auditory late latency) responses were recorded and compared across both groups.
RESULTS: The results showed a statistically significant reduction in the amplitudes of the subcortical potentials (both for stimulus in quiet and in noise) and the magnitudes of the spectral components (fundamental frequency and the second formant) in children with SIN deficits in the APD group compared to the TD group. In addition, the APD group displayed enhanced amplitudes of the cortical potentials compared to the TD group.
CONCLUSION: Children with SIN deficits associated with APD exhibited impaired coding/processing of the auditory information at the level of the brainstem and the auditory cortex.
SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.21357735.}, }
@article {pmid36279201, year = {2022}, author = {Bochner, J and Samar, V and Prud'hommeaux, E and Huenerfauth, M}, title = {Phoneme Categorization in Prelingually Deaf Adult Cochlear Implant Users.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {11}, pages = {4429-4453}, doi = {10.1044/2022_JSLHR-22-00038}, pmid = {36279201}, issn = {1558-9102}, mesh = {Adult ; Humans ; Young Adult ; *Cochlear Implants ; *Deafness/rehabilitation ; *Speech Perception ; *Cochlear Implantation ; Hearing ; }, abstract = {PURPOSE: Phoneme categorization (PC) for voice onset time and second formant transition was studied in adult cochlear implant (CI) users with early-onset deafness and hearing controls.
METHOD: Identification and discrimination tasks were administered to 30 participants implanted before 4 years of age, 21 participants implanted after 7 years of age, and 21 hearing individuals.
RESULTS: Distinctive identification and discrimination functions confirmed PC within all groups. Compared to hearing participants, the CI groups generally displayed longer/higher category boundaries, shallower identification function slopes, reduced identification consistency, and reduced discrimination performance. A principal component analysis revealed that identification consistency, discrimination accuracy, and identification function slope, but not boundary location, loaded on a single factor, reflecting general PC performance. Earlier implantation was associated with better PC performance within the early CI group, but not the late CI group. Within the early CI group, earlier implantation age but not PC performance was associated with better speech recognition. Conversely, within the late CI group, better PC performance but not earlier implantation age was associated with better speech recognition.
CONCLUSIONS: Results suggest that implantation timing within the sensitive period before 4 years of age partly determines the level of PC performance. They also suggest that early implantation may promote development of higher level processes that can compensate for relatively poor PC performance, as can occur in challenging listening conditions.}, }
@article {pmid36266347, year = {2022}, author = {Skrabal, D and Rusz, J and Novotny, M and Sonka, K and Ruzicka, E and Dusek, P and Tykalova, T}, title = {Articulatory undershoot of vowels in isolated REM sleep behavior disorder and early Parkinson's disease.}, journal = {NPJ Parkinson's disease}, volume = {8}, number = {1}, pages = {137}, pmid = {36266347}, issn = {2373-8057}, abstract = {Imprecise vowels represent a common deficit associated with hypokinetic dysarthria resulting from a reduced articulatory range of motion in Parkinson's disease (PD). It is not yet known whether the vowel articulation impairment is already evident in the prodromal stages of synucleinopathy. We aimed to assess whether vowel articulation abnormalities are present in isolated rapid eye movement sleep behaviour disorder (iRBD) and early-stage PD. A total of 180 male participants, including 60 iRBD, 60 de-novo PD, and 60 age-matched healthy controls, read a standardized passage. The first and second formant frequencies of the corner vowels /a/, /i/, and /u/, extracted from predefined words, were utilized to construct articulatory-acoustic measures of Vowel Space Area (VSA) and Vowel Articulation Index (VAI). Compared to controls, VSA was smaller in both iRBD (p = 0.01) and PD (p = 0.001), while VAI was lower only in PD (p = 0.002). The iRBD subgroup with abnormal olfactory function had a smaller VSA compared to the iRBD subgroup with preserved olfactory function (p = 0.02). In PD patients, the extent of bradykinesia and rigidity correlated with VSA (r = -0.33, p = 0.01), while no correlation between axial gait symptoms or tremor and vowel articulation was detected. Vowel articulation impairment represents an early prodromal symptom in the disease process of synucleinopathy. Acoustic assessment of vowel articulation may provide a surrogate marker of synucleinopathy in scenarios where a single robust feature to monitor the dysarthria progression is needed.}, }
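The measures above also have standard formulations: the triangular VSA over the corner vowels /a/, /i/, /u/, and the VAI, which is the reciprocal of the formant centralization ratio and falls as articulation undershoots. A sketch with placeholder formant values:

```python
# Sketch: triangular vowel space area over /a/, /i/, /u/ and the vowel
# articulation index (VAI, reciprocal of the FCR). Hz values are
# illustrative placeholders, not data from the study.
F1 = {"i": 300, "a": 750, "u": 350}
F2 = {"i": 2300, "a": 1200, "u": 800}

tri_vsa = abs(
    F1["i"] * (F2["a"] - F2["u"])
    + F1["a"] * (F2["u"] - F2["i"])
    + F1["u"] * (F2["i"] - F2["a"])
) / 2.0
vai = (F2["i"] + F1["a"]) / (F1["i"] + F1["u"] + F2["u"] + F2["a"])
print("triangular VSA = %.0f Hz^2, VAI = %.2f" % (tri_vsa, vai))
```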
@article {pmid36266224, year = {2022}, author = {Zhang, T and He, M and Li, B and Zhang, C and Hu, J}, title = {Acoustic Characteristics of Cantonese Speech Through Protective Facial Coverings.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.08.029}, pmid = {36266224}, issn = {1873-4588}, abstract = {OBJECTIVES: Protective facial coverings (PFCs) such as surgical masks attenuate speech transmission and affect speech intelligibility, which is reported in languages such as English and German. The present study intended to verify the detrimental impacts on production of tonal languages such as Cantonese, by examining realization of speech correlates in Cantonese under PFCs including facial masks and shields.
METHODS: We recorded scripted speech in Hong Kong Cantonese produced by three adult speakers who wore various PFCs, including surgical masks, KF94 masks, and face shields (with and without surgical masks). Spectral and temporal parameters were measured, including mean intensity, speaking rate, long-term amplitude spectrum, formant frequencies of vowels, and duration and fundamental frequency (F0) of tone-bearing parts.
RESULTS: Significant changes were observed in all acoustic correlates of Cantonese speech under PFCs. Sound pressure levels were attenuated more intensely at ranges of higher frequencies in speech through face masks, whereas sound transmission was affected more at ranges of lower frequencies in speech under face shields. Vowel spaces derived from formant frequencies shrank under all PFCs, with the vowel /aa/ demonstrating the largest changes in the first two formants. All tone-bearing parts were shortened and showed increments of F0 means in speech through PFCs. The decrease of tone duration was statistically significant in the High-level and Low-level tones, while the increment of F0 means was significant in the High-level tone only.
CONCLUSIONS: A general filtering effect of PFCs is observed in Cantonese speech data, confirming language-universal patterns in acoustic attenuation by PFCs. The various coverings lower overall intensity levels of speech and degrade the speech signal in higher frequency regions. Modification patterns specific to Hong Kong Cantonese are also identified. Vowel space area is reduced and is found to be associated with increased speaking rates. Tones are produced with higher F0s under PFCs, which may be attributed to vocal tension caused by a tightened vocal tract when speaking through facial coverings.}, }
@article {pmid36215575, year = {2022}, author = {Urzúa, AR and Wolf, KB}, title = {Unitary rotation of pixellated polychromatic images.}, journal = {Journal of the Optical Society of America. A, Optics, image science, and vision}, volume = {39}, number = {8}, pages = {1323-1329}, doi = {10.1364/JOSAA.462530}, pmid = {36215575}, issn = {1520-8532}, abstract = {Unitary rotations of polychromatic images on finite two-dimensional pixellated screens provide invertibility, group composition, and thus conservation of information. Rotations have been applied on monochromatic image data sets, where we now examine closer the Gibbs-like oscillations that appear due to discrete "discontinuities" of the input images under unitary transformations. Extended to three-color images, we examine here the display of color at the pixels where, due to oscillations, some pixel color values may fall outside their required common numerical range [0,1], between absence and saturation of the red, green, and blue formant colors we choose to represent the images.}, }
@article {pmid36182345, year = {2022}, author = {Rothenberg, M and Rothenberg, S}, title = {Measuring the distortion of speech by a facemask.}, journal = {JASA express letters}, volume = {2}, number = {9}, pages = {095203}, doi = {10.1121/10.0014002}, pmid = {36182345}, issn = {2691-1191}, mesh = {Acoustics ; Masks ; Mouth ; *Speech ; *Voice ; }, abstract = {Most prior research focuses on the reduced amplitude of speech caused by facemasks. This paper argues that the interaction between the acoustic properties of a facemask and the acoustic properties of the vocal tract contributes to speech distortion by changing the formants of the voice. The speech distortion caused by a number of masks was tested by measuring the increase in damping of the first formant. Results suggest that masks dampen the first formant and that increasing the distance between the mask wall and mouth can reduce this distortion. These findings contribute to the research studying the impact of masks on speech.}, }
@article {pmid36182341, year = {2022}, author = {Tran Ngoc, A and Meunier, F and Meyer, J}, title = {Testing perceptual flexibility in speech through the categorization of whistled Spanish consonants by French speakers.}, journal = {JASA express letters}, volume = {2}, number = {9}, pages = {095201}, doi = {10.1121/10.0013900}, pmid = {36182341}, issn = {2691-1191}, mesh = {Cues ; Humans ; Language ; Phonetics ; *Speech/physiology ; *Speech Perception/physiology ; }, abstract = {Whistled speech is a form of modified speech where, in non-tonal languages, vowels and consonants are augmented and transposed to whistled frequencies, simplifying their timbre. According to previous studies, these transformations maintain some level of vowel recognition for naive listeners. Here, in a behavioral experiment, naive listeners' capacities for the categorization of four whistled consonants (/p/, /k/, /t/, and /s/) were analyzed. Results show patterns of correct responses and confusions that provide new insights into whistled speech perception, highlighting the importance of frequency modulation cues, transposed from phoneme formants, as well as the perceptual flexibility in processing these cues.}, }
@article {pmid36182291, year = {2022}, author = {Winn, MB and Wright, RA}, title = {Reconsidering commonly used stimuli in speech perception experiments.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {3}, pages = {1394}, doi = {10.1121/10.0013415}, pmid = {36182291}, issn = {1520-8524}, mesh = {Language ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception/physiology ; *Voice ; }, abstract = {This paper examines some commonly used stimuli in speech perception experiments and raises questions about their use, or about the interpretations of previous results. The takeaway messages are: 1) the Hillenbrand vowels represent a particular dialect rather than a gold standard, and English vowels contain spectral dynamics that have been largely underappreciated, 2) the /ɑ/ context is very common but not clearly superior as a context for testing consonant perception, 3) /ɑ/ is particularly problematic when testing voice-onset-time perception because it introduces strong confounds in the formant transitions, 4) /dɑ/ is grossly overrepresented in neurophysiological studies and yet is insufficient as a generalized proxy for "speech perception," and 5) digit tests and matrix sentences including the coordinate response measure are systematically insensitive to important patterns in speech perception. Each of these stimulus sets and concepts is described with careful attention to their unique value and also cases where they might be misunderstood or over-interpreted.}, }
@article {pmid36171463, year = {2022}, author = {Borodkin, K and Gassner, T and Ershaid, H and Amir, N}, title = {tDCS modulates speech perception and production in second language learners.}, journal = {Scientific reports}, volume = {12}, number = {1}, pages = {16212}, pmid = {36171463}, issn = {2045-2322}, mesh = {Acoustic Stimulation ; Adult ; Humans ; Language ; Phonetics ; Speech/physiology ; *Speech Perception/physiology ; *Transcranial Direct Current Stimulation ; }, abstract = {Accurate identification and pronunciation of nonnative speech sounds can be particularly challenging for adult language learners. The current study tested the effects of a brief musical training combined with transcranial direct current stimulation (tDCS) on speech perception and production in a second language (L2). The sample comprised 36 native Hebrew speakers, aged 18-38, who studied English as L2 in a formal setting and had little musical training. Training encompassed musical perception tasks with feedback (i.e., timbre, duration, and tonal memory) and concurrent tDCS applied over the left posterior auditory-related cortex (including posterior superior temporal gyrus and planum temporale). Participants were randomly assigned to anodal or sham stimulation. Musical perception, L2 speech perception (measured by a categorical AXB discrimination task) and speech production (measured by a speech imitation task) were tested before and after training. There were no tDCS-dependent effects on musical perception post-training. However, only participants who received active stimulation showed increased accuracy of L2 phoneme discrimination and greater change in the acoustic properties of L2 speech sound production (i.e., second formant frequency in vowels and center of gravity in consonants). The results of this study suggest neuromodulation can facilitate the processing of nonnative speech sounds in adult learners.}, }
@article {pmid36154230, year = {2022}, author = {Morse, RP and Holmes, SD and Irving, R and McAlpine, D}, title = {Noise helps cochlear implant listeners to categorize vowels.}, journal = {JASA express letters}, volume = {2}, number = {4}, pages = {042001}, doi = {10.1121/10.0010071}, pmid = {36154230}, issn = {2691-1191}, mesh = {*Cochlear Implantation ; *Cochlear Implants ; Noise/adverse effects ; Phonetics ; *Speech Perception ; }, abstract = {Theoretical studies demonstrate that controlled addition of noise can enhance the amount of information transmitted by a cochlear implant (CI). The present study is a proof-of-principle for whether stochastic facilitation can improve the ability of CI users to categorize speech sounds. Analogue vowels were presented to CI users through a single electrode with independent noise on multiple electrodes. Noise improved vowel categorization, particularly in terms of an increase in information conveyed by the first and second formant. Noise, however, did not significantly improve vowel recognition: the miscategorizations were just more consistent, giving the potential to improve with experience.}, }
@article {pmid36129844, year = {2022}, author = {Easwar, V and Purcell, D and Lasarev, M and McGrath, E and Galloy, M}, title = {Speech-Evoked Envelope Following Responses in Children and Adults.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {10}, pages = {4009-4023}, doi = {10.1044/2022_JSLHR-22-00156}, pmid = {36129844}, issn = {1558-9102}, mesh = {Acoustic Stimulation ; Adolescent ; Child ; Hearing Tests ; Humans ; Male ; Sensitivity and Specificity ; *Speech ; *Speech Perception/physiology ; Young Adult ; }, abstract = {PURPOSE: Envelope following responses (EFRs) could be useful for objectively evaluating audibility of speech in children who are unable to participate in routine clinical tests. However, relative to adults, the characteristics of EFRs elicited by frequency-specific speech and their utility in predicting audibility in children are unknown.
METHOD: EFRs were elicited by the first (F1) and second and higher formants (F2+) of male-spoken vowels /u/ and /i/ and by fricatives /ʃ/ and /s/ in the token /suʃi/ presented at 15, 35, 55, 65, and 75 dB SPL. The F1, F2+, and fricatives were low-, mid-, and high-frequency dominant, respectively. EFRs were recorded between the vertex and the nape from twenty-three 6- to 17-year-old children and 21 young adults with normal hearing. Sensation levels of stimuli were estimated based on behavioral thresholds.
RESULTS: In children, amplitude decreased with age for /ʃ/-elicited EFRs but remained stable for low- and mid-frequency stimuli. As a group, EFR amplitude and phase coherence did not differ from those of adults. EFR sensitivity (proportion of audible stimuli detected) and specificity (proportion of inaudible stimuli not detected) did not vary between children and adults. Consistent with previous work, EFR sensitivity increased with stimulus frequency and level. The type of statistical indicator used for EFR detection did not influence accuracy in children.
CONCLUSIONS: Adultlike EFRs in 6- to 17-year-old typically developing children suggest mature envelope encoding for low- and mid-frequency stimuli. EFR sensitivity and specificity in children, when considering a wide range of stimulus levels and audibility, are ~77% and ~92%, respectively.
SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.21136171.}, }
@article {pmid36092651, year = {2022}, author = {Nault, DR and Mitsuya, T and Purcell, DW and Munhall, KG}, title = {Perturbing the consistency of auditory feedback in speech.}, journal = {Frontiers in human neuroscience}, volume = {16}, number = {}, pages = {905365}, pmid = {36092651}, issn = {1662-5161}, abstract = {Sensory information, including auditory feedback, is used by talkers to maintain fluent speech articulation. Current models of speech motor control posit that speakers continually adjust their motor commands based on discrepancies between the sensory predictions made by a forward model and the sensory consequences of their speech movements. Here, in two within-subject design experiments, we used a real-time formant manipulation system to explore how reliant speech articulation is on the accuracy or predictability of auditory feedback information. This involved introducing random formant perturbations during vowel production that varied systematically in their spatial location in formant space (Experiment 1) and temporal consistency (Experiment 2). Our results indicate that, on average, speakers' responses to auditory feedback manipulations varied based on the relevance and degree of the error that was introduced in the various feedback conditions. In Experiment 1, speakers' average production was not reliably influenced by random perturbations that were introduced every utterance to the first (F1) and second (F2) formants in various locations of formant space that had an overall average of 0 Hz. However, when perturbations were applied that had a mean of +100 Hz in F1 and -125 Hz in F2, speakers demonstrated reliable compensatory responses that reflected the average magnitude of the applied perturbations. In Experiment 2, speakers did not significantly compensate for perturbations of varying magnitudes that were held constant for one and three trials at a time. Speakers' average productions did, however, significantly deviate from a control condition when perturbations were held constant for six trials. Within the context of these conditions, our findings provide evidence that the control of speech movements is, at least in part, dependent upon the reliability and stability of the sensory information that it receives over time.}, }
@article {pmid36063640, year = {2022}, author = {Frankford, SA and Cai, S and Nieto-Castañón, A and Guenther, FH}, title = {Auditory feedback control in adults who stutter during metronome-paced speech II. Formant Perturbation.}, journal = {Journal of fluency disorders}, volume = {74}, number = {}, pages = {105928}, pmid = {36063640}, issn = {1873-801X}, support = {R01 DC007683/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Humans ; *Stuttering/therapy ; Speech/physiology ; Feedback ; Feedback, Sensory/physiology ; Auditory Perception/physiology ; }, abstract = {PURPOSE: Prior work has shown that Adults who stutter (AWS) have reduced and delayed responses to auditory feedback perturbations. This study aimed to determine whether external timing cues, which increase fluency, resolve auditory feedback processing disruptions.
METHODS: Fifteen AWS and sixteen adults who do not stutter (ANS) read aloud a multisyllabic sentence either with natural stress and timing or with each syllable paced at the rate of a metronome. On random trials, an auditory feedback formant perturbation was applied, and formant responses were compared between groups and pacing conditions.
RESULTS: During normally paced speech, ANS showed a significant compensatory response to the perturbation by the end of the perturbed vowel, while AWS did not. In the metronome-paced condition, which significantly reduced the disfluency rate, the opposite was true: AWS showed a significant response by the end of the vowel, while ANS did not.
CONCLUSION: These findings indicate a potential link between the reduction in stuttering found during metronome-paced speech and changes in auditory motor integration in AWS.}, }
@article {pmid36050247, year = {2022}, author = {Lee, SH and Lee, GS}, title = {Long-term Average Spectrum and Nasal Accelerometry in Sentences of Differing Nasality and Forward-Focused Vowel Productions Under Altered Auditory Feedback.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.07.026}, pmid = {36050247}, issn = {1873-4588}, abstract = {OBJECTIVES AND BACKGROUND: To investigate whether voice focus adjustments can alter the audio-vocal feedback and consequently modulate speech/voice motor control. Speaking with a forward-focused voice was expected to enhance audio-vocal feedback and thus decrease the variability of vocal fundamental frequency (F0).
MATERIALS AND METHOD: Twenty-two healthy, untrained adults (10 males and 12 females) were requested to sustain the vowel /a/ with their natural focus and a forward focus and to naturally read the nasal, oral, and mixed oral-nasal sentences in normal and noise-masked auditory conditions. Meanwhile, a miniature accelerometer was externally attached to the nose to detect the nasal vibrations during vocalization. Audio recordings were made and analyzed using the long-term average spectrum (LTAS) and power spectral analysis of F0.
RESULTS: Compared with naturally-focused vowel production and oral sentences, forward-focused vowel productions and nasal sentences both showed significant increases in nasal accelerometric amplitude and the spectral power within the range of 200∼300 Hz, and significantly decreased the F0 variability below 3 Hz, which has been reported to be associated with enhanced auditory feedback in our previous research. The auditory masking not only significantly increased the low-frequency F0 variability, but also significantly decreased the ratio of the spectral power within 200∼300 Hz to the power within 300∼1000 Hz for the vowel and sentence productions. Gender differences were found in the correlations between the degree of nasal coupling and F0 stability as well as in the LTAS characteristics in response to noise.
CONCLUSIONS: Variations in nasal-oral acoustic coupling not only change the formant features of speech signals, but also involuntarily influence the auditory feedback control of vocal fold vibrations. Speakers tend to show improved F0 stability in response to a forward-focused voice adjustment.}, }
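The band-limited spectral power measures described above reduce to integrating a power spectral density over fixed frequency bands. A minimal sketch of the 200-300 Hz to 300-1000 Hz power ratio using a Welch PSD, with a synthetic two-tone signal standing in for a recording:

```python
# Sketch: ratio of spectral power in the 200-300 Hz band to the
# 300-1000 Hz band from a Welch PSD. The input signal is synthetic.
import numpy as np
from scipy.signal import welch

def band_power(freqs, psd, lo, hi):
    mask = (freqs >= lo) & (freqs < hi)
    return np.trapz(psd[mask], freqs[mask])

sr = 16000
t = np.arange(2 * sr) / sr
x = np.sin(2 * np.pi * 250 * t) + 0.5 * np.sin(2 * np.pi * 600 * t)

freqs, psd = welch(x, fs=sr, nperseg=4096)
ratio = band_power(freqs, psd, 200, 300) / band_power(freqs, psd, 300, 1000)
print("power ratio (200-300 Hz / 300-1000 Hz) = %.2f" % ratio)
```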
@article {pmid36050180, year = {2022}, author = {Ibrahim, O and Yuen, I and van Os, M and Andreeva, B and Möbius, B}, title = {The combined effects of contextual predictability and noise on the acoustic realisation of German syllables.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {2}, pages = {911}, doi = {10.1121/10.0013413}, pmid = {36050180}, issn = {1520-8524}, mesh = {Acoustics ; Language ; *Noise/adverse effects ; Speech ; Speech Acoustics ; *Speech Perception ; }, abstract = {Speakers tend to speak clearly in noisy environments, while they tend to reserve effort by shortening word duration in predictable contexts. It is unclear how these two communicative demands are met. The current study investigates the acoustic realizations of syllables in predictable vs unpredictable contexts across different background noise levels. Thirty-eight German native speakers produced 60 CV syllables in two predictability contexts in three noise conditions (reference = quiet, 0 dB and -10 dB signal-to-noise ratio). Duration, intensity (average and range), F0 (median), and vowel formants of the target syllables were analysed. The presence of noise yielded significantly longer duration, higher average intensity, larger intensity range, and higher F0. Noise levels affected intensity (average and range) and F0. Low predictability syllables exhibited longer duration and larger intensity range. However, no interaction was found between noise and predictability. This suggests that noise-related modifications might be independent of predictability-related changes, with implications for including channel-based and message-based formulations in speech production.}, }
@article {pmid36050169, year = {2022}, author = {Krumbiegel, J and Ufer, C and Blank, H}, title = {Influence of voice properties on vowel perception depends on speaker context.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {2}, pages = {820}, doi = {10.1121/10.0013363}, pmid = {36050169}, issn = {1520-8524}, mesh = {Humans ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {Different speakers produce the same intended vowel with very different physical properties. Fundamental frequency (F0) and formant frequencies (FF), the two main parameters that discriminate between voices, also influence vowel perception. While it has been shown that listeners comprehend speech more accurately if they are familiar with a talker's voice, it is still unclear how such prior information is used when decoding the speech stream. In three online experiments, we examined the influence of speaker context via F0 and FF shifts on the perception of /o/-/u/ vowel contrasts. Participants perceived vowels from an /o/-/u/ continuum shifted toward /u/ when F0 was lowered or FF increased relative to the original speaker's voice and vice versa. This shift was reduced when the speakers were presented in a block-wise context compared to random order. Conversely, the original base voice was perceived to be shifted toward /u/ when presented in the context of a low F0 or high FF speaker, compared to a shift toward /o/ with high F0 or low FF speaker context. These findings demonstrate that F0 and FF jointly influence vowel perception in speaker context.}, }
@article {pmid36050157, year = {2022}, author = {Whalen, DH and Chen, WR and Shadle, CH and Fulop, SA}, title = {Formants are easy to measure; resonances, not so much: Lessons from Klatt (1986).}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {2}, pages = {933}, pmid = {36050157}, issn = {1520-8524}, mesh = {*Acoustics ; Algorithms ; Canada ; Humans ; Language ; *Speech Acoustics ; }, abstract = {Formants in speech signals are easily identified, largely because formants are defined to be local maxima in the wideband sound spectrum. Sadly, this is not what is of most interest in analyzing speech; instead, resonances of the vocal tract are of interest, and they are much harder to measure. Klatt [(1986). in Proceedings of the Montreal Satellite Symposium on Speech Recognition, 12th International Congress on Acoustics, edited by P. Mermelstein (Canadian Acoustical Society, Montreal), pp. 5-7] showed that estimates of resonances are biased by harmonics while the human ear is not. Several analysis techniques placed the formant closer to a strong harmonic than to the center of the resonance. This "harmonic attraction" can persist with newer algorithms and in hand measurements, and systematic errors can persist even in large corpora. Research has shown that the reassigned spectrogram is less subject to these errors than linear predictive coding and similar measures, but it has not been satisfactorily automated, making its wider use unrealistic. Pending better techniques, the recommendations are (1) acknowledge limitations of current analyses regarding influence of F0 and limits on granularity, (2) report settings more fully, (3) justify settings chosen, and (4) examine the pattern of F0 vs F1 for possible harmonic bias.}, }
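The "harmonic attraction" discussed above afflicts the standard LPC analysis pipeline: fit an all-pole model, solve for the roots of the LPC polynomial, and read formant candidates off the root angles. A sketch of that textbook pipeline (not the authors' method) on a strongly harmonic synthetic vowel, where root-derived estimates can land nearer a strong harmonic than the true resonance center:

```python
# Sketch: LPC root-solving formant estimation. The synthetic vowel has
# "true" resonances centered at 500 and 1500 Hz, but with f0 = 200 Hz no
# harmonic sits exactly at either center, so LPC estimates can be pulled
# toward the nearest strong harmonics (400/600 and 1400/1600 Hz).
import numpy as np
import librosa

sr = 10000
t = np.arange(int(0.05 * sr)) / sr
f0 = 200.0
x = sum(np.sin(2 * np.pi * f0 * k * t) *
        (np.exp(-((f0 * k - 500) / 150) ** 2) +
         np.exp(-((f0 * k - 1500) / 200) ** 2))
        for k in range(1, 25))

a = librosa.lpc(x, order=10)                 # all-pole model coefficients
roots = [r for r in np.roots(a) if np.imag(r) > 0]
freqs = sorted(np.angle(roots) * sr / (2 * np.pi))
print("LPC formant candidates (Hz):", ["%.0f" % f for f in freqs])
```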
@article {pmid36009709, year = {2022}, author = {Beeck, VC and Heilmann, G and Kerscher, M and Stoeger, AS}, title = {Sound Visualization Demonstrates Velopharyngeal Coupling and Complex Spectral Variability in Asian Elephants.}, journal = {Animals : an open access journal from MDPI}, volume = {12}, number = {16}, pages = {}, pmid = {36009709}, issn = {2076-2615}, support = {P 31034/FWF_/Austrian Science Fund FWF/Austria ; }, abstract = {Sound production mechanisms set the parameter space available for transmitting biologically relevant information in vocal signals. Low-frequency rumbles play a crucial role in coordinating social interactions in elephants' complex fission-fusion societies. By emitting rumbles through either the oral or the three-times longer nasal vocal tract, African elephants alter their spectral shape significantly. In this study, we used an acoustic camera to visualize the sound emission of rumbles in Asian elephants, which have received far less research attention than African elephants. We recorded nine adult captive females and analyzed the spectral parameters of 203 calls, including vocal tract resonances (formants). We found that the majority of rumbles (64%) were nasally emitted, 21% orally, and 13% simultaneously through the mouth and trunk, demonstrating velopharyngeal coupling. Some of the rumbles were combined with orally emitted roars. The nasal rumbles concentrated most spectral energy in lower frequencies exhibiting two formants, whereas the oral and mixed rumbles contained higher formants, higher spectral energy concentrations and were louder. The roars were the loudest, highest and broadest in frequency. This study is the first to demonstrate velopharyngeal coupling in a non-human animal. Our findings provide a foundation for future research into the adaptive functions of the elephant acoustic variability for information coding, localizability or sound transmission, as well as vocal flexibility across species.}, }
@article {pmid36007484, year = {2022}, author = {Rong, P and Hansen, O and Heidrick, L}, title = {Relationship between rate-elicited changes in muscular-kinematic control strategies and acoustic performance in individuals with ALS-A multimodal investigation.}, journal = {Journal of communication disorders}, volume = {99}, number = {}, pages = {106253}, doi = {10.1016/j.jcomdis.2022.106253}, pmid = {36007484}, issn = {1873-7994}, mesh = {Acoustics ; *Amyotrophic Lateral Sclerosis ; Biomechanical Phenomena/physiology ; Humans ; Speech/physiology ; Speech Acoustics ; Speech Intelligibility/physiology ; Speech Production Measurement ; Tongue ; }, abstract = {INTRODUCTION: As a key control variable, duration has been long suspected to mediate the organization of speech motor control strategies, which has management implications for neuromotor speech disorders. This study aimed to experimentally delineate the role of duration in organizing speech motor control in neurologically healthy and impaired speakers using a voluntary speaking rate manipulation paradigm.
METHODS: Thirteen individuals with amyotrophic lateral sclerosis (ALS) and 10 healthy controls performed a sentence reading task three times, first at their habitual rate, then at a slower rate. A multimodal approach combining surface electromyography, kinematic, and acoustic technologies was used to record jaw muscle activities, jaw kinematics, and speech acoustics. Six muscular-kinematic features were extracted and factor-analyzed to characterize the organization of the mandibular control hierarchy. Five acoustic features were extracted, measuring the spectrotemporal properties of the diphthong /ɑɪ/ and the plosives /t/ and /k/.
RESULTS: The muscular-kinematic features converged into two interpretable latent factors, reflecting the level and cohesiveness/flexibility of mandibular control, respectively. Voluntary rate reduction led to a trend toward (1) finer, less cohesive, and more flexible mandibular control, and (2) increased range and decreased transition slope of the diphthong formants, across neurologically healthy and impaired groups. Differential correlations were found between the rate-elicited changes in mandibular control and acoustic performance for neurologically healthy and impaired speakers.
CONCLUSIONS: The results provided empirical evidence for the long-suspected but previously unsubstantiated role of duration in (re)organizing speech motor control strategies. The rate-elicited reorganization of muscular-kinematic control contributed to the acoustic performance of healthy speakers, in ways consistent with theoretical predictions. Such contributions were less consistent in impaired speakers, pointing to the complex nature of speaking rate reduction in ALS, which possibly reflects an interplay of disease-related constraints and volitional duration control. This information may help to stratify and identify candidates for rate manipulation therapy.}, }
@article {pmid36002663, year = {2022}, author = {Easwar, V and Aiken, S and Beh, K and McGrath, E and Galloy, M and Scollie, S and Purcell, D}, title = {Variability in the Estimated Amplitude of Vowel-Evoked Envelope Following Responses Caused by Assumed Neurophysiologic Processing Delays.}, journal = {Journal of the Association for Research in Otolaryngology : JARO}, volume = {23}, number = {6}, pages = {759-769}, pmid = {36002663}, issn = {1438-7573}, support = {//CIHR/Canada ; }, mesh = {Young Adult ; Child ; Male ; Humans ; Adolescent ; Acoustic Stimulation/methods ; *Speech Perception/physiology ; Noise ; Electroencephalography/methods ; Reaction Time/physiology ; }, abstract = {Vowel-evoked envelope following responses (EFRs) reflect neural encoding of the fundamental frequency of voice (f0). Accurate analysis of EFRs elicited by natural vowels requires the use of methods like the Fourier analyzer (FA) to consider the production-related f0 changes. The FA's accuracy in estimating EFRs is, however, dependent on the assumed neurophysiological processing delay needed to time-align the f0 time course and the recorded electroencephalogram (EEG). For male-spoken vowels (f0 ~ 100 Hz), a constant 10-ms delay correction is often assumed. Since processing delays vary with stimulus and physiological factors, we quantified (i) the delay-related variability that would occur in EFR estimation, and (ii) the influence of stimulus frequency, non-f0 related neural activity, and the listener's age on such variability. EFRs were elicited by the low-frequency first formant, and mid-frequency second and higher formants of /u/, /a/, and /i/ in young adults and 6- to 17-year-old children. To time-align with the f0 time course, EEG was shifted by delays between 5 and 25 ms to encompass plausible response latencies. The delay-dependent range in EFR amplitude did not vary by stimulus frequency or age and was significantly smaller when interference from low-frequency activity was reduced. On average, the delay-dependent range was < 22% of the maximum variability in EFR amplitude that could be expected by noise. Results suggest that using a constant EEG delay correction in FA analysis does not substantially alter EFR amplitude estimation. In the present study, the lack of substantial variability was likely facilitated by using vowels with small f0 ranges.}, }
@article {pmid35993422, year = {2022}, author = {Clarke, H and Leav, S and Zestic, J and Mohamed, I and Salisbury, I and Sanderson, P}, title = {Enhanced Neonatal Pulse Oximetry Sounds for the First Minutes of Life: A Laboratory Trial.}, journal = {Human factors}, volume = {}, number = {}, pages = {187208221118472}, doi = {10.1177/00187208221118472}, pmid = {35993422}, issn = {1547-8181}, abstract = {OBJECTIVE: Auditory enhancements to the pulse oximetry tone may help clinicians detect deviations from target ranges for oxygen saturation (SpO2) and heart rate (HR).
BACKGROUND: Clinical guidelines recommend target ranges for SpO2 and HR during neonatal resuscitation in the first 10 minutes after birth. The pulse oximeter currently maps HR to tone rate, and SpO2 to tone pitch. However, deviations from target ranges for SpO2 and HR are not easy to detect.
METHOD: Forty-one participants were presented with 30-second simulated scenarios of an infant's SpO2 and HR levels in the first minutes after birth. Tremolo marked distinct HR ranges and formants marked distinct SpO2 ranges. Participants were randomly allocated to conditions: (a) No Enhancement control, (b) Enhanced HR Only, (c) Enhanced SpO2 Only, and (d) Enhanced Both.
RESULTS: Participants in the Enhanced HR Only and Enhanced SpO2 Only conditions identified HR and SpO2 ranges, respectively, more accurately than participants in the No Enhancement condition, ps < 0.001. In the Enhanced Both condition, the tremolo enhancement of HR did not affect participants' ability to identify SpO2 range, but the formants enhancement of SpO2 may have attenuated participants' ability to identify tremolo-enhanced HR range.
CONCLUSION: Tremolo and formant enhancements improve range identification for HR and SpO2, respectively, and could improve clinicians' ability to identify SpO2 and HR ranges in the first minutes after birth.
APPLICATION: Enhancements to the pulse oximeter tone to indicate clinically important ranges could improve the management of oxygen delivery to the neonate during resuscitation in the first 10 minutes after birth.}, }
@article {pmid35961825, year = {2022}, author = {Nascimento, GFD and Silva, HJD and Oliveira, KGSC and Lira, SZ and Gomes, AOC}, title = {Relationship Between Oropharyngeal Geometry and Acoustic Parameters in Singers: A Preliminary Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.07.012}, pmid = {35961825}, issn = {1873-4588}, abstract = {OBJECTIVE: To verify possible correlations between formant and cepstral parameters and oropharyngeal geometry in singers, stratified by sex.
METHOD: Voice records and oropharyngeal measures of 31 singers - 13 females and 18 males, mean age of 28 (±5.0) years - were retrieved from a database and analyzed. The oropharyngeal geometry measures were collected with acoustic pharyngometry, and the voice records consisted of sustained vowel /ɛ/ phonation, which were exported to Praat software and edited to obtain the formant and cepstral parameters, stratified by sex. The Pearson linear correlation test was applied to relate voice parameters to oropharyngeal geometry, at the 5% significance level; the linear regression test was used to justify the variable related to the second formant.
RESULTS: Differences between the sexes were identified only in the oral cavity length (greater in males) and pharyngeal cavity length (greater in females). There was a linear correlation between the third formant and the cepstrum in the female group. In the male group, there was a linear correlation between the cepstrum and the third and fourth formants. A positive linear correlation with up to 95% confidence was also identified between the pharyngeal cavity volume and the second formant in the female group, making it possible to estimate a regression model for the second formant (R2 = 0.70).
CONCLUSION: There are correlations between the oropharyngeal geometry and formant and cepstral parameters in relation to sex. In females, the pharyngeal cavity volume showed the strongest correlation with the second formant.}, }
@article {pmid35951711, year = {2022}, author = {Nishimura, T and Tokuda, IT and Miyachi, S and Dunn, JC and Herbst, CT and Ishimura, K and Kaneko, A and Kinoshita, Y and Koda, H and Saers, JPP and Imai, H and Matsuda, T and Larsen, ON and Jürgens, U and Hirabayashi, H and Kojima, S and Fitch, WT}, title = {Evolutionary loss of complexity in human vocal anatomy as an adaptation for speech.}, journal = {Science (New York, N.Y.)}, volume = {377}, number = {6607}, pages = {760-763}, doi = {10.1126/science.abm1574}, pmid = {35951711}, issn = {1095-9203}, mesh = {Animals ; *Biological Evolution ; Humans ; *Larynx/anatomy & histology ; *Phonation ; Phonetics ; *Primates ; *Speech ; Speech Acoustics ; *Vocal Cords/anatomy & histology ; }, abstract = {Human speech production obeys the same acoustic principles as vocal production in other animals but has distinctive features: A stable vocal source is filtered by rapidly changing formant frequencies. To understand speech evolution, we examined a wide range of primates, combining observations of phonation with mathematical modeling. We found that source stability relies upon simplifications in laryngeal anatomy, specifically the loss of air sacs and vocal membranes. We conclude that the evolutionary loss of vocal membranes allows human speech to mostly avoid the spontaneous nonlinear phenomena and acoustic chaos common in other primate vocalizations. This loss allows our larynx to produce stable, harmonic-rich phonation, ideally highlighting formant changes that convey most phonetic information. Paradoxically, the increased complexity of human spoken language thus followed simplification of our laryngeal anatomy.}, }
@article {pmid35944059, year = {2022}, author = {Suresh, CH and Krishnan, A}, title = {Frequency-Following Response to Steady-State Vowel in Quiet and Background Noise Among Marching Band Participants With Normal Hearing.}, journal = {American journal of audiology}, volume = {31}, number = {3}, pages = {719-736}, doi = {10.1044/2022_AJA-21-00226}, pmid = {35944059}, issn = {1558-9137}, mesh = {Acoustic Stimulation/methods ; Auditory Perception/physiology ; Hearing ; Humans ; *Noise ; Sound ; *Speech Perception/physiology ; }, abstract = {OBJECTIVE: Human studies enrolling individuals at high risk for cochlear synaptopathy (CS) have reported difficulties in speech perception in adverse listening conditions. The aim of this study is to determine if these individuals show a degradation in the neural encoding of speech in quiet and in the presence of background noise as reflected in neural phase-locking to both envelope periodicity and temporal fine structure (TFS). To our knowledge, there are no published reports that have specifically examined the neural encoding of both envelope periodicity and TFS of speech stimuli (in quiet and in adverse listening conditions) among a sample with loud-sound exposure history who are at risk for CS.
METHOD: Using scalp-recorded frequency-following response (FFR), the authors evaluated the neural encoding of envelope periodicity (FFRENV) and TFS (FFRTFS) for a steady-state vowel (English back vowel /u/) in quiet and in the presence of speech-shaped noise presented at +5 and 0 dB SNR. Participants were young individuals with normal hearing who had participated in a marching band for at least 5 years (high-risk group) or who belonged to a non-marching band group with a low-noise exposure history (low-risk group).
RESULTS: The results showed no group differences in the neural encoding of either the FFRENV or the first formant (F1) in the FFRTFS in quiet and in noise. Paradoxically, the high-risk group demonstrated enhanced representation of F2 harmonics across all stimulus conditions.
CONCLUSIONS: These results appear to be in line with a music experience-dependent enhancement of F2 harmonics. However, due to sound overexposure in the high-risk group, the role of homeostatic central compensation cannot be ruled out. A larger-scale data set with different noise exposure backgrounds, together with longitudinal measurements using an array of behavioral and electrophysiological tests, is needed to disentangle the nature of the complex interaction between the effects of central compensatory gain and experience-dependent enhancement.}, }
@article {pmid35944047, year = {2022}, author = {McAllister, T and Eads, A and Kabakoff, H and Scott, M and Boyce, S and Whalen, DH and Preston, JL}, title = {Baseline Stimulability Predicts Patterns of Response to Traditional and Ultrasound Biofeedback Treatment for Residual Speech Sound Disorder.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {8}, pages = {2860-2880}, pmid = {35944047}, issn = {1558-9102}, support = {F31 DC018197/DC/NIDCD NIH HHS/United States ; R01 DC017476/DC/NIDCD NIH HHS/United States ; R01 DC013668/DC/NIDCD NIH HHS/United States ; }, mesh = {*Apraxias ; Biofeedback, Psychology/methods ; Humans ; Language ; Speech/physiology ; *Speech Sound Disorder/diagnostic imaging/therapy ; Speech Therapy/methods ; }, abstract = {PURPOSE: This study aimed to identify predictors of response to treatment for residual speech sound disorder (RSSD) affecting English rhotics. Progress was tracked during an initial phase of traditional motor-based treatment and a longer phase of treatment incorporating ultrasound biofeedback. Based on previous literature, we focused on baseline stimulability and sensory acuity as predictors of interest.
METHOD: Thirty-three individuals aged 9-15 years with residual distortions of /ɹ/ received a course of individual intervention comprising 1 week of intensive traditional treatment and 9 weeks of ultrasound biofeedback treatment. Stimulability for /ɹ/ was probed prior to treatment, after the traditional treatment phase, and after the end of all treatment. Accuracy of /ɹ/ production in each probe was assessed with an acoustic measure: normalized third formant (F3)-second formant (F2) distance. Model-based clustering analysis was applied to these acoustic measures to identify different average trajectories of progress over the course of treatment. The resulting clusters were compared with respect to acuity in auditory and somatosensory domains.
RESULTS: All but four individuals were judged to exhibit a clinically significant response to the combined course of treatment. Two major clusters were identified. The "low stimulability" cluster was characterized by very low accuracy at baseline, minimal response to traditional treatment, and strong response to ultrasound biofeedback. The "high stimulability" group was more accurate at baseline and made significant gains in both traditional and ultrasound biofeedback phases of treatment. The clusters did not differ with respect to sensory acuity.
CONCLUSIONS: This research accords with clinical intuition in finding that individuals who are more stimulable at baseline are more likely to respond to traditional intervention, whereas less stimulable individuals may derive greater relative benefit from biofeedback.
SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.20422236.}, }
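The accuracy metric in the entry above, normalized F3-F2 distance, can be made concrete with a short sketch: a smaller F3-F2 gap signals a more /ɹ/-like production. The within-speaker z-normalization used here is an illustrative assumption, not necessarily the study's exact procedure.

```python
import statistics

def normalized_f3_f2_distance(f2_hz: list[float], f3_hz: list[float]) -> list[float]:
    """Per-token F3-F2 distance, z-scored within speaker.

    Lower values indicate more rhotic (/r/-like) productions. The
    z-scoring step is an assumption for illustration; the cited study
    may normalize differently (e.g., against age-sex norms).
    """
    distances = [f3 - f2 for f2, f3 in zip(f2_hz, f3_hz)]
    mu = statistics.mean(distances)
    sd = statistics.stdev(distances)
    return [(d - mu) / sd for d in distances]

# Example: the third token (smallest F3-F2 gap) is the most rhotic.
print(normalized_f3_f2_distance([1300, 1400, 1700], [2900, 2700, 2200]))
```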
@article {pmid35931553, year = {2022}, author = {Levi, SV}, title = {Teaching acoustic phonetics to undergraduates in communication sciences and disorders: Course structure and sample projects.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {1}, pages = {651}, doi = {10.1121/10.0012984}, pmid = {35931553}, issn = {1520-8524}, mesh = {Acoustics ; Communication ; Humans ; *Phonetics ; *Speech Acoustics ; Students ; }, abstract = {Virtually all undergraduate communication sciences and disorders programs require a course that covers acoustic phonetics. Students typically have a separate phonetics (transcription) course prior to taking the acoustic phonetics course. This paper describes a way to structure an acoustic phonetics course into two halves: a first half that focuses on the source, including basic acoustics (simple harmonic motion, harmonics), vocal fold vibration, modes of phonation, and intonation, and a second half that focuses on the filter, including resonance and tube models, vowel formants, and consonant acoustics. Thus, basic acoustic properties are interwoven with specific examples of speech-related acoustics. In addition, two projects that illustrate concepts from the two halves of the course (one on fundamental frequency and the other on vowel formants) are presented.}, }
@article {pmid35931547, year = {2022}, author = {Mills, HE and Shorey, AE and Theodore, RM and Stilp, CE}, title = {Context effects in perception of vowels differentiated by F1 are not influenced by variability in talkers' mean F1 or F3.}, journal = {The Journal of the Acoustical Society of America}, volume = {152}, number = {1}, pages = {55}, doi = {10.1121/10.0011920}, pmid = {35931547}, issn = {1520-8524}, mesh = {*Phonetics ; Sound ; Sound Spectrography ; Speech Acoustics ; *Speech Perception ; }, abstract = {Spectral properties of earlier sounds (context) influence recognition of later sounds (target). Acoustic variability in context stimuli can disrupt this process. When mean fundamental frequencies (f0's) of preceding context sentences were highly variable across trials, shifts in target vowel categorization [due to spectral contrast effects (SCEs)] were smaller than when sentence mean f0's were less variable; when sentences were rearranged to exhibit high or low variability in mean first formant frequencies (F1) in a given block, SCE magnitudes were equivalent [Assgari, Theodore, and Stilp (2019) J. Acoust. Soc. Am. 145(3), 1443-1454]. However, since sentences were originally chosen based on variability in mean f0, stimuli underrepresented the extent to which mean F1 could vary. Here, target vowels (/ɪ/-/ɛ/) were categorized following context sentences that varied substantially in mean F1 (experiment 1) or mean F3 (experiment 2) with variability in mean f0 held constant. In experiment 1, SCE magnitudes were equivalent whether context sentences had high or low variability in mean F1; the same pattern was observed in experiment 2 for new sentences with high or low variability in mean F3. Variability in some acoustic properties (mean f0) can be more perceptually consequential than others (mean F1, mean F3), but these results may be task-dependent.}, }
@article {pmid35920586, year = {2023}, author = {Feng, Y and Peng, G}, title = {Development of categorical speech perception in Mandarin-speaking children and adolescents.}, journal = {Child development}, volume = {94}, number = {1}, pages = {28-43}, pmid = {35920586}, issn = {1467-8624}, mesh = {Male ; Adult ; Humans ; Child ; Adolescent ; *Speech Perception ; Cross-Sectional Studies ; Linguistics ; Asian ; China ; }, abstract = {Although children develop categorical speech perception at a very young age, the maturation process remains unclear. A cross-sectional study in Mandarin-speaking 4-, 6-, and 10-year-old children, 14-year-old adolescents, and adults (n = 104, 56 males, all Asians from mainland China) was conducted to investigate the development of categorical perception of four Mandarin phonemic contrasts: lexical tone contrast Tone 1-2, vowel contrast /u/-/i/, consonant aspiration contrast /p/-/pʰ/, and consonant formant transition contrast /p/-/t/. The results indicated that different types of phonemic contrasts, and even the identification and discrimination of the same phonemic contrast, matured asynchronously. The observation that tone and vowel perception are achieved earlier than consonant perception supports the phonological saliency hypothesis.}, }
@article {pmid35916929, year = {2023}, author = {Song, J and Wan, Q and Wang, Y and Zhou, H}, title = {Establishment of a Multi-parameter Evaluation Model for Risk of Aspiration in Dysphagia: A Pilot Study.}, journal = {Dysphagia}, volume = {38}, number = {1}, pages = {406-414}, pmid = {35916929}, issn = {1432-0460}, mesh = {Humans ; Deglutition ; *Deglutition Disorders/diagnosis/etiology ; Pilot Projects ; Risk Factors ; }, abstract = {It is difficult for clinical bedside evaluations to accurately determine the occurrence of aspiration in patients. Although VFSS and FEES are the gold standards for clinical diagnosis of dysphagia, mainly used to evaluate people identified by bedside screening as being at high risk of dysphagia, their operation is complicated and time-consuming. The aim of this pilot study was to present an objective measure based on a multi-parameter approach to screen for aspiration risk in patients with dysphagia. Objective evaluation techniques based on speech parameters were used to assess the oral motor function, vocal cord function, and voice changes before and after swallowing in 32 patients with dysphagia (16 low-risk aspiration group, 16 high-risk aspiration group). Student's t test combined with stepwise logistic regression was used to determine the optimal index. The best model consists of three parameters, and the equation is: logit(P) = -3.824 - 0.504 × (maximum phonation time) + 0.008 × (second formant frequency of /u/) - 0.085 × (fundamental frequency difference before and after swallowing). An additional eight patients with dysphagia were randomly selected as the validation group of the model. When applied to validation, this model accurately identified the risk of aspiration in 87.5% of patients, with a sensitivity as high as 100%. Therefore, it may have practical clinical value in helping clinicians to assess the risk of aspiration in patients with dysphagia, especially silent aspiration.}, }
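The three-parameter screening model in the entry above is fully specified by its coefficients, so the score it produces can be computed directly. A minimal sketch, assuming maximum phonation time in seconds and the frequency terms in Hz (units the abstract does not state):

```python
import math

def aspiration_risk(max_phonation_time: float,
                    f2_u_hz: float,
                    delta_f0_hz: float) -> float:
    """Probability of high aspiration risk from the reported model.

    Coefficients come from the abstract; input units are assumptions
    (phonation time in seconds, frequencies in Hz).
    """
    logit_p = (-3.824
               - 0.504 * max_phonation_time
               + 0.008 * f2_u_hz
               - 0.085 * delta_f0_hz)
    return 1.0 / (1.0 + math.exp(-logit_p))  # inverse logit

# Shorter maximum phonation time raises the estimated risk, all else equal.
print(round(aspiration_risk(5.0, 900.0, 10.0), 3))   # ~0.50
print(round(aspiration_risk(15.0, 900.0, 10.0), 3))  # ~0.01
```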
@article {pmid35905807, year = {2022}, author = {Lee, GS and Chang, CW}, title = {Comparisons of auditory brainstem response elicited by compound click-sawtooths sound and synthetic consonant-vowel /da/.}, journal = {Physiology & behavior}, volume = {255}, number = {}, pages = {113922}, doi = {10.1016/j.physbeh.2022.113922}, pmid = {35905807}, issn = {1873-507X}, mesh = {Acoustic Stimulation ; Evoked Potentials, Auditory/physiology ; *Evoked Potentials, Auditory, Brain Stem/physiology ; Humans ; Phonetics ; Reaction Time/physiology ; Sound ; *Speech Perception/physiology ; }, abstract = {The auditory brainstem response to complex sounds (cABR) can be evoked using speech sounds such as the 40-ms synthetic consonant-vowel syllable /da/ (CV-da), which is commonly used in basic and clinical research. The cABR consists of responses to formant energy as well as the energy of the fundamental frequency. The co-existence of these two energy components makes the cABR a mixed response. We introduced a new click-sawtooth stimulus (CSW) with similar time-lock patterns but without formant or harmonic energy. Ten young healthy volunteers were recruited, and cABRs to CV-da and CSW were acquired from their 20 ears. The response latencies, amplitudes, and frequency-domain analytic results were compared pairwise between stimuli. The response amplitudes were significantly greater and the latencies significantly shorter for CSW. The latency-intensity functions were also greater for CSW. For CSW, adjustments of one energy component can be made without causing biased changes to the other. CSW may be used in future basic research and clinical applications.}, }
@article {pmid35894373, year = {2022}, author = {França, FP and Almeida, AA and Lopes, LW}, title = {Immediate effect of different exercises in the vocal space of women with and without vocal nodules.}, journal = {CoDAS}, volume = {34}, number = {5}, pages = {e20210157}, pmid = {35894373}, issn = {2317-1782}, mesh = {Exercise ; Female ; Humans ; Language ; *Phonetics ; *Speech Acoustics ; Tongue ; }, abstract = {PURPOSE: To investigate the immediate effect of voiced tongue vibration (VSL), high-resistance straw in the air (CAR), and overarticulation (OA) on the vowel space of vocally healthy women (MVS) and women with vocal nodules (MNV).
METHODS: Twelve women participated in the MNV and 12 in the MVS, allocated to perform the vocal exercises of VSL, CAR, and OA. Each participant performed only one of the three proposed exercises, for 5 minutes, preceded and followed by the recording of a sequence of carrier sentences for extracting formants (F1 and F2) from the vowel segments [a, i, u]. The vowel space was analyzed through the differences between the formant measures of the vowels.
RESULTS: We observed a reduction of F1 in the intervals [a]-[i] and [i]-[u] and of F2 between the vowels [a]-[u] and [i]-[u] in the MVS, after performing the CAR. In the MNV, we observed a reduction of F2 in the interval [a]-[i] after VSL. In the intergroup analysis, there were higher F1 values in the intervals of the vowels [a]-[i] and [i]-[u] in the MVS, before performing the CAR, and after exercise only in the interval [a]-[i]. A higher value of F1 and F2 was observed in the interval between the vowels [i]-[u] in the MNV after VSL.
CONCLUSION: The VSL exercise reduced the vowel space in MNV women. CAR reduced the vowel space of women in the MVS. The MNV had a smaller vowel space compared to the MVS before and after the CAR. We observed a reduction in the vowel space in the MNV compared to the MVS after the VSL exercise.}, }
@article {pmid35874163, year = {2022}, author = {Wang, H and Max, L}, title = {Inter-Trial Formant Variability in Speech Production Is Actively Controlled but Does Not Affect Subsequent Adaptation to a Predictable Formant Perturbation.}, journal = {Frontiers in human neuroscience}, volume = {16}, number = {}, pages = {890065}, pmid = {35874163}, issn = {1662-5161}, support = {R01 DC014510/DC/NIDCD NIH HHS/United States ; R01 DC017444/DC/NIDCD NIH HHS/United States ; }, abstract = {Despite ample evidence that speech production is associated with extensive trial-to-trial variability, it remains unclear whether this variability represents merely unwanted system noise or an actively regulated mechanism that is fundamental for maintaining and adapting accurate speech movements. Recent work on upper limb movements suggests that inter-trial variability may not only be actively regulated based on sensory feedback, but also provide a type of workspace exploration that facilitates sensorimotor learning. We therefore investigated whether experimentally reducing or magnifying inter-trial formant variability in the real-time auditory feedback during speech production (a) leads to adjustments in formant production variability that compensate for the manipulation, (b) changes the temporal structure of formant adjustments across productions, and (c) enhances learning in a subsequent adaptation task in which a predictable formant-shift perturbation is applied to the feedback signal. Results show that subjects gradually increased formant variability in their productions when hearing auditory feedback with reduced variability, but subsequent formant-shift adaptation was not affected by either reducing or magnifying the perceived variability. Thus, findings provide evidence for speakers' active control of inter-trial formant variability based on auditory feedback from previous trials, but-at least for the current short-term experimental manipulation of feedback variability-not for a role of this variability regulation mechanism in subsequent auditory-motor learning.}, }
@article {pmid35865705, year = {2022}, author = {Mailhos, A and Egea-Caparrós, DA and Guerrero Rodríguez, C and Luzardo, M and Kiskimska, ND and Martínez Sánchez, F}, title = {Vocal Cues to Male Physical Formidability.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {879102}, pmid = {35865705}, issn = {1664-1078}, abstract = {Animal vocalizations convey important information about the emitter, including sex, age, biological quality, and emotional state. Early on, Darwin proposed that sex differences in auditory signals and vocalizations were driven by sexual selection mechanisms. In humans, studies on the association between male voice attributes and physical formidability have thus far reported mixed results. Hence, with a view to furthering our understanding of the role of human voice in advertising physical formidability, we sought to identify acoustic attributes of male voices associated with physical formidability proxies. Mean fundamental frequency (F0), formant dispersion (Df), formant position (Pf), and vocal tract length (VTL) data from a sample of 101 male voices were analyzed for potential associations with height, weight, and maximal handgrip strength (HGS). F0 correlated negatively with HGS; Pf showed negative correlations with HGS, height and weight, whereas VTL positively correlated with HGS, height and weight. All zero-order correlations remained significant after controlling for false discovery rate (FDR) with the Benjamini-Hochberg method. After controlling for height and weight-and controlling for FDR-the correlation between F0 and HGS remained significant. In addition, to evaluate the ability of human male voices to advertise physical formidability to potential mates, 151 heterosexual female participants rated the voices of the 10 strongest and the 10 weakest males from the original sample for perceived physical strength, and given that physical strength is a desirable attribute in male partners, perceived attractiveness. Generalized linear mixed model analyses-which allow for generalization of inferences to other samples of both raters and targets-failed to support a significant association between perceived strength or attractiveness from voices alone and actual physical strength. These results add to the growing body of work on the role of human voices in conveying relevant biological information.}, }
@article {pmid35858255, year = {2022}, author = {Shao, J and Bakhtiar, M and Zhang, C}, title = {Impaired Categorical Perception of Speech Sounds Under the Backward Masking Condition in Adults Who Stutter.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {7}, pages = {2554-2570}, doi = {10.1044/2022_JSLHR-21-00276}, pmid = {35858255}, issn = {1558-9102}, mesh = {Adult ; Auditory Perception ; Child ; Humans ; Phonetics ; Speech ; *Speech Perception ; *Stuttering ; *Voice ; }, abstract = {PURPOSE: Evidence increasingly indicates that people with developmental stuttering have auditory perception deficits. Our previous research has indicated similar but slower performance in categorical perception of the speech sounds under the quiet condition in children who stutter and adults who stutter (AWS) compared with their typically fluent counterparts. We hypothesized that the quiet condition may not be sufficiently sensitive to reveal subtle perceptual deficiencies in people who stutter. This study examined this hypothesis by testing the categorical perception of speech and nonspeech sounds under backward masking condition (i.e., a noise was presented immediately after the target stimuli).
METHOD: Fifteen Cantonese-speaking AWS and 15 adults who do not stutter (AWNS) were tested on the categorical perception of four stimulus continua, namely, consonant varying in voice onset time (VOT), vowel, lexical tone, and nonspeech, under the backward masking condition using identification and discrimination tasks.
RESULTS: AWS demonstrated a broader boundary width than AWNS in the identification task. AWS also exhibited a worse performance than AWNS in the discrimination of between-category stimuli but a comparable performance in the discrimination of within-category stimuli, indicating reduced sensitivity to sounds that belonged to different phonemic categories among AWS. Moreover, AWS showed similar patterns of impaired categorical perception across the four stimulus types, although the boundary location on the VOT continuum occurred at an earlier point in AWS than in AWNS.
CONCLUSIONS: The findings provide robust evidence that AWS exhibit impaired categorical perception of speech and nonspeech sounds under the backward masking condition. Temporal processing (i.e., VOT manipulation), frequency/spectral/formant processing (i.e., lexical tone or vowel manipulations), and nonlinguistic pitch processing were all found to be impaired in AWS. Altogether, the findings support the hypothesis that AWS might be less efficient in accessing the phonemic representations when exposed to a demanding listening condition.
SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.20249718.}, }
@article {pmid35858067, year = {2022}, author = {Baciadonna, L and Solvi, C and Del Vecchio, F and Pilenga, C and Baracchi, D and Bandoli, F and Isaja, V and Gamba, M and Favaro, L}, title = {Vocal accommodation in penguins (Spheniscus demersus) as a result of social environment.}, journal = {Proceedings. Biological sciences}, volume = {289}, number = {1978}, pages = {20220626}, pmid = {35858067}, issn = {1471-2954}, mesh = {Animals ; Communication ; Humans ; Language ; Social Environment ; *Spheniscidae ; Vocalization, Animal ; }, abstract = {The ability to vary the characteristics of one's voice is a critical feature of human communication. Understanding whether and how animals change their calls will provide insights into the evolution of language. We asked to what extent the vocalizations of penguins, a phylogenetically distant species from those capable of explicit vocal learning, are flexible and responsive to their social environment. Using a principal components (PCs) analysis, we reduced 14 vocal parameters of penguins' contact calls to four PCs, each comprising highly correlated parameters that can be categorized as fundamental frequency, formant frequency, frequency modulation, and amplitude modulation rate and duration. We compared how these differed between individuals with varying degrees of social interactions: same-colony versus different-colony, same colony over 3 years and partners versus non-partners. Our analyses indicate that the more penguins experience each other's calls, the more similar their calls become over time, that vocal convergence requires a long time and relative stability in colony membership, and that partners' unique social bond may affect vocal convergence differently than non-partners. Our results suggest that this implicit form of vocal plasticity is perhaps more widespread across the animal kingdom than previously thought and may be a fundamental capacity of vertebrate vocalization.}, }
@article {pmid35804282, year = {2022}, author = {Easwar, V and Chung, L}, title = {The influence of phoneme contexts on adaptation in vowel-evoked envelope following responses.}, journal = {The European journal of neuroscience}, volume = {56}, number = {5}, pages = {4572-4582}, pmid = {35804282}, issn = {1460-9568}, mesh = {Acoustic Stimulation ; Humans ; Male ; Phonetics ; *Speech Perception/physiology ; }, abstract = {Repeated stimulus presentation leads to neural adaptation and consequent amplitude reduction in vowel-evoked envelope following responses (EFRs)-a response that reflects neural activity phase-locked to envelope periodicity. EFRs are elicited by vowels presented in isolation or in the context of other phonemes such as consonants in syllables. While context phonemes could exert some forward influence on vowel-evoked EFRs, they may reduce the degree of adaptation. Here, we evaluated whether the properties of context phonemes between consecutive vowel stimuli influence adaptation. EFRs were elicited by the low-frequency first formant (resolved harmonics) and middle-to-high-frequency second and higher formants (unresolved harmonics) of a male-spoken /i/ when the presence, number and predictability of context phonemes (/s/, /a/, /ʃ/ and /u/) between vowel repetitions varied. Monitored over four iterations of /i/, adaptation was evident only for EFRs elicited by the unresolved harmonics. EFRs elicited by the unresolved harmonics decreased in amplitude by ~16-20 nV (10%-17%) after the first presentation of /i/ and remained stable thereafter. EFR adaptation was reduced by the presence of a context phoneme, but the reduction did not change with their number or predictability. The presence of a context phoneme, however, attenuated EFRs by a degree similar to that caused by adaptation (~21-23 nV). Such a trade-off in the short- and long-term influence of context phonemes suggests that the benefit of interleaving EFR-eliciting vowels with other context phonemes depends on whether the use of consonant-vowel syllables is critical to improve the validity of EFR applications.}, }
@article {pmid35802401, year = {2022}, author = {Teferra, BG and Borwein, S and DeSouza, DD and Simpson, W and Rheault, L and Rose, J}, title = {Acoustic and Linguistic Features of Impromptu Speech and Their Association With Anxiety: Validation Study.}, journal = {JMIR mental health}, volume = {9}, number = {7}, pages = {e36828}, pmid = {35802401}, issn = {2368-7959}, abstract = {BACKGROUND: The measurement and monitoring of generalized anxiety disorder requires frequent interaction with psychiatrists or psychologists. Access to mental health professionals is often difficult because of high costs or insufficient availability. The ability to assess generalized anxiety disorder passively and at frequent intervals could be a useful complement to conventional treatment and help with relapse monitoring. Prior work suggests that higher anxiety levels are associated with features of human speech. As such, monitoring speech using personal smartphones or other wearable devices may be a means to achieve passive anxiety monitoring.
OBJECTIVE: This study aims to validate the association of previously suggested acoustic and linguistic features of speech with anxiety severity.
METHODS: A large number of participants (n=2000) were recruited and participated in a single web-based study session. Participants completed the Generalized Anxiety Disorder 7-item scale assessment and provided an impromptu speech sample in response to a modified version of the Trier Social Stress Test. Acoustic and linguistic speech features were a priori selected based on the existing speech and anxiety literature, along with related features. Associations between speech features and anxiety levels were assessed using age and personal income as covariates.
RESULTS: Word count and speaking duration were negatively correlated with anxiety scores (r=-0.12; P<.001), indicating that participants with higher anxiety scores spoke less. Several acoustic features were also significantly (P<.05) associated with anxiety, including the mel-frequency cepstral coefficients, linear prediction cepstral coefficients, shimmer, fundamental frequency, and first formant. In contrast to previous literature, the second and third formants, jitter, and the zero-crossing rate of the z score of the power spectral density were not significantly associated with anxiety. Linguistic features, including negative-emotion words, were also associated with anxiety (r=0.10; P<.001). In addition, some linguistic relationships were sex dependent. For example, the count of words related to power was positively associated with anxiety in women (r=0.07; P=.03), whereas it was negatively associated with anxiety in men (r=-0.09; P=.01).
CONCLUSIONS: Both acoustic and linguistic speech measures are associated with anxiety scores. The amount of speech, acoustic quality of speech, and gender-specific linguistic characteristics of speech may be useful as part of a system to screen for anxiety, detect relapse, or monitor treatment.}, }
@article {pmid35778699, year = {2022}, author = {Lin, YC and Yan, HT and Lin, CH and Chang, HH}, title = {Predicting frailty in older adults using vocal biomarkers: a cross-sectional study.}, journal = {BMC geriatrics}, volume = {22}, number = {1}, pages = {549}, pmid = {35778699}, issn = {1471-2318}, mesh = {Aged ; Biomarkers ; Cross-Sectional Studies ; Female ; Frail Elderly ; *Frailty/diagnosis/epidemiology ; Humans ; Male ; Odds Ratio ; *Osteoporotic Fractures ; }, abstract = {BACKGROUND: Frailty is a common issue in the aging population. Given that frailty syndrome is little discussed in the literature on the aging voice, the current study aims to examine the relationship between frailty and vocal biomarkers in older people.
METHODS: Participants aged ≥ 60 years visiting geriatric outpatient clinics were recruited. They underwent frailty assessment (Cardiovascular Health Study [CHS] index; Study of Osteoporotic Fractures [SOF] index; and Fatigue, Resistance, Ambulation, Illness, and Loss of weight [FRAIL] index) and were asked to pronounce a sustained vowel /a/ for approximately 1 s. Four voice parameters were assessed: average number of zero crossings (A1), variations in local peaks and valleys (A2), variations in first and second formant frequencies (A3), and spectral energy ratio (A4).
RESULTS: Among 277 older adults, increased A1 was associated with a lower likelihood of frailty as defined by SOF (odds ratio [OR] 0.84, 95% confidence interval [CI] 0.74-0.96). Participants with larger A2 values were more likely to be frail, as defined by FRAIL and CHS (FRAIL: OR 1.41, 95% CI 1.12-1.79; CHS: OR 1.38, 95% CI 1.10-1.75). Sex differences were observed across the three frailty indices. In male participants, an increase in A3 by 10 points increased the odds of frailty by almost 7% (SOF: OR 1.07, 95% CI 1.02-1.12), 6% (FRAIL: OR 1.06, 95% CI 1.02-1.11), or 6% (CHS: OR 1.06, 95% CI 1.01-1.11). In female participants, an increase in A4 by 0.1 conferred a significant 2.8-fold (SOF: OR 2.81, 95% CI 1.71-4.62), 2.3-fold (FRAIL: OR 2.31, 95% CI 1.45-3.68), or 2.8-fold (CHS: OR 2.82, 95% CI 1.76-4.51) increased odds of frailty.
CONCLUSIONS: Vocal biomarkers, especially spectral-domain voice parameters, might have potential as a non-invasive, instantaneous, objective, and cost-effective tool for estimating frailty, and the observed sex differences may support individualised treatment of frailty.}, }
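Of the four voice parameters defined in the entry above, A1 (the average number of zero crossings) is the most standard. Below is a minimal sketch of one plausible reading, zero crossings per second of the sustained /a/ waveform, noting that the study's exact formulations of A1-A4 are not given in the abstract.

```python
import numpy as np

def zero_crossing_rate(signal: np.ndarray, sample_rate: int) -> float:
    """Zero crossings per second of a mono waveform.

    A standard DSP definition, offered as one plausible reading of
    the study's A1 parameter; the paper's exact formula may differ.
    """
    signs = np.signbit(signal).astype(np.int8)
    crossings = np.count_nonzero(np.diff(signs))
    return crossings * sample_rate / len(signal)

# A 100 Hz sine sampled at 16 kHz crosses zero about 200 times per second.
t = np.linspace(0.0, 1.0, 16000, endpoint=False)
print(zero_crossing_rate(np.sin(2 * np.pi * 100 * t), 16000))
```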
@article {pmid35778208, year = {2022}, author = {Jibson, J}, title = {Formant detail needed for identifying, rating, and discriminating vowels in Wisconsin English.}, journal = {The Journal of the Acoustical Society of America}, volume = {151}, number = {6}, pages = {4004}, doi = {10.1121/10.0011539}, pmid = {35778208}, issn = {1520-8524}, mesh = {*Language ; Wisconsin ; }, abstract = {Neel [(2004). Acoust. Res. Lett. Online 5, 125-131] asked how much time-varying formant detail is needed for vowel identification. In that study, multiple stimuli were synthesized for each vowel: 1-point (monophthongal with midpoint frequencies), 2-point (linear from onset to offset), 3-point, 5-point, and 11-point. Results suggested that a 3-point model was optimal. This conflicted with the dual-target hypothesis of vowel inherent spectral change research, which has found that two targets are sufficient to model vowel identification. The present study replicates and expands upon the work of Neel. Ten English monophthongs were chosen for synthesis. One-, two-, three-, and five-point vowels were created as described above, and another 1-point stimulus was created with onset frequencies rather than midpoint frequencies. Three experiments were administered (n = 18 for each): vowel identification, goodness rating, and discrimination. The results ultimately align with the dual-target hypothesis, consistent with most vowel inherent spectral change studies.}, }
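The stimulus families in the entry above (1-point monophthongal, 2-point linear from onset to offset, and so on) amount to piecewise-linear formant tracks through equally spaced targets. A small sketch of that construction follows, with illustrative frequency values rather than the study's synthesis parameters.

```python
import numpy as np

def formant_track(targets_hz: list[float], n_frames: int = 50) -> np.ndarray:
    """Piecewise-linear formant track through equally spaced targets.

    One target yields a flat (monophthongal) track; two yield a linear
    onset-to-offset glide, matching the 1-point and 2-point stimuli
    described above. Frequencies here are illustrative only.
    """
    targets = np.asarray(targets_hz, dtype=float)
    if targets.size == 1:
        return np.full(n_frames, targets[0])
    anchors = np.linspace(0, n_frames - 1, targets.size)
    return np.interp(np.arange(n_frames), anchors, targets)

# 2-point vs 3-point F2 track for an /ai/-like diphthong.
print(formant_track([1200.0, 2300.0])[:5])
print(formant_track([1200.0, 1500.0, 2300.0])[:5])
```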
@article {pmid35749662, year = {2022}, author = {Groll, MD and Dahl, KL and Cádiz, MD and Welch, B and Tracy, LF and Stepp, CE}, title = {Resynthesis of Transmasculine Voices to Assess Gender Perception as a Function of Testosterone Therapy.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {7}, pages = {2474-2489}, pmid = {35749662}, issn = {1558-9102}, support = {R01 DC015570/DC/NIDCD NIH HHS/United States ; R01 DC020061/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Speech ; Speech Acoustics ; *Speech Perception ; Testosterone ; *Voice ; Young Adult ; }, abstract = {PURPOSE: The goal of this study was to use speech resynthesis to investigate the effects of changes to individual acoustic features on speech-based gender perception of transmasculine voice samples following the onset of hormone replacement therapy (HRT) with exogenous testosterone. We hypothesized that mean fundamental frequency (f0) would have the largest effect on gender perception of any single acoustic feature.
METHOD: Mean f0, f0 contour, and formant frequencies were calculated for three pairs of transmasculine speech samples before and after HRT onset. Sixteen speech samples with unique combinations of these acoustic features from each pair of speech samples were resynthesized. Twenty young adult listeners evaluated each synthesized speech sample for gender perception and synthetic quality. Two analyses of variance were used to investigate the effects of acoustic features on gender perception and synthetic quality.
RESULTS: Of the three acoustic features, mean f0 was the only single feature that had a statistically significant effect on gender perception. Differences between the speech samples before and after HRT onset that were not captured by changes in f0 and formant frequencies also had a statistically significant effect on gender perception.
CONCLUSION: In these transmasculine voice samples, mean f0 was the most important acoustic feature for voice masculinization as a result of HRT; future investigations in a larger number of transmasculine speakers, and of the effects of behavioral therapy-based changes in concert with HRT, are warranted.}, }
@article {pmid35744460, year = {2022}, author = {Yan, S and Liu, P and Chen, Z and Liu, J and Shen, L and Zhang, X and Cui, J and Li, T and Cui, Y and Ren, Y}, title = {High-Property Refractive Index and Bio-Sensing Dual-Purpose Sensor Based on SPPs.}, journal = {Micromachines}, volume = {13}, number = {6}, pages = {}, pmid = {35744460}, issn = {2072-666X}, abstract = {A high-property plasma resonance-sensor structure consisting of two metal-insulator-metal (MIM) waveguides coupled with a transverse ladder-shaped nano-cavity (TLSNC) is designed based on surface plasmon polaritons. Its transmission characteristics are analyzed using multimode interference coupling mode theory (MICMT), and are simulated using finite element analysis (FEA). Meanwhile, the influence of different structural parameters on the performance of the structure is investigated. This study shows that the system presents four high-quality formants in the transmission spectrum. The highest sensitivity is 3000 nm/RIU with a high FOM* of 9.7 × 10⁵. In addition, the proposed structure could act as a biosensor to detect the concentrations of sodium ions (Na⁺), potassium ions (K⁺), and the glucose solution with maximum sensitivities of 0.45, 0.625, and 5.5 nm·(mg/dL)⁻¹, respectively. Compared with other structures, the designed system has the advantages of a simple construction, a wide working band range, high reliability and easy nano-scale integration, providing a high-performance cavity choice for refractive index sensing and biosensing devices based on surface plasmons.}, }
@article {pmid35737731, year = {2022}, author = {Ham, J and Yoo, HJ and Kim, J and Lee, B}, title = {Vowel speech recognition from rat electroencephalography using long short-term memory neural network.}, journal = {PloS one}, volume = {17}, number = {6}, pages = {e0270405}, pmid = {35737731}, issn = {1932-6203}, mesh = {Animals ; Electroencephalography/methods ; Male ; Memory, Short-Term ; Neural Networks, Computer ; Rats ; Rats, Sprague-Dawley ; Speech ; *Speech Perception ; }, abstract = {Over the years, considerable research has been conducted to investigate the mechanisms of speech perception and recognition. Electroencephalography (EEG) is a powerful tool for identifying brain activity; therefore, it has been widely used to determine the neural basis of speech recognition. In particular, for the classification of speech recognition, deep learning-based approaches are in the spotlight because they can automatically learn and extract representative features through end-to-end learning. This study aimed to identify particular components that are potentially related to phoneme representation in the rat brain and to discriminate brain activity for each vowel stimulus on a single-trial basis using a bidirectional long short-term memory (BiLSTM) network and classical machine learning methods. Nineteen male Sprague-Dawley rats subjected to microelectrode implantation surgery to record EEG signals from the bilateral anterior auditory fields were used. Five different vowel speech stimuli were chosen, /a/, /e/, /i/, /o/, and /u/, which have highly different formant frequencies. EEG recorded under randomly given vowel stimuli was minimally preprocessed and normalized by a z-score transformation to be used as input for the classification of speech recognition. The BiLSTM network showed the best performance among the classifiers by achieving an overall accuracy, f1-score, and Cohen's κ values of 75.18%, 0.75, and 0.68, respectively, using a 10-fold cross-validation approach. These results indicate that LSTM layers can effectively model sequential data, such as EEG; hence, informative features can be derived through BiLSTM trained with end-to-end learning without any additional hand-crafted feature extraction methods.}, }
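A minimal sketch of the kind of BiLSTM classifier described in the entry above: z-scored EEG sequences in, five vowel classes out. The layer sizes, channel count, and sequence length are illustrative assumptions, not the study's configuration.

```python
import torch
import torch.nn as nn

class EEGVowelBiLSTM(nn.Module):
    """Bidirectional LSTM over EEG (batch, time, channels) -> 5 vowels.

    Hyperparameters below are placeholders for illustration; the
    study's actual architecture details are not in the abstract.
    """

    def __init__(self, n_channels: int = 2, hidden: int = 64,
                 n_classes: int = 5):
        super().__init__()
        self.lstm = nn.LSTM(n_channels, hidden, batch_first=True,
                            bidirectional=True)
        self.fc = nn.Linear(2 * hidden, n_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out, _ = self.lstm(x)          # (batch, time, 2 * hidden)
        return self.fc(out[:, -1, :])  # classify from the last step

# Eight single-trial epochs of 256 samples from 2 channels.
logits = EEGVowelBiLSTM()(torch.randn(8, 256, 2))
print(logits.shape)  # torch.Size([8, 5])
```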
@article {pmid35731636, year = {2023}, author = {Pravitharangul, N and Miyamoto, JJ and Yoshizawa, H and Matsumoto, T and Suzuki, S and Chantarawaratit, PO and Moriyama, K}, title = {Vowel sound production and its association with cephalometric characteristics in skeletal Class III subjects.}, journal = {European journal of orthodontics}, volume = {45}, number = {1}, pages = {20-28}, doi = {10.1093/ejo/cjac031}, pmid = {35731636}, issn = {1460-2210}, mesh = {Male ; Humans ; *Speech Acoustics ; Speech ; Acoustics ; Cephalometry ; *Overbite ; }, abstract = {BACKGROUND: This study aimed to evaluate differences in vowel production using acoustic analysis in skeletal Class III and Class I Japanese participants and to identify the correlation between vowel sounds and cephalometric variables in skeletal Class III subjects.
MATERIALS AND METHODS: Japanese males with skeletal Class III (ANB < 0°) and Class I skeletal anatomy (0.62° < ANB < 5.94°) were recruited (n = 18/group). Acoustic analysis of vowel sounds and cephalometric analysis of lateral cephalograms were performed. For sound analysis, an isolated Japanese vowel (/a/, /i/, /u/, /e/, /o/) pattern was recorded. Praat software was used to extract acoustic parameters such as fundamental frequency (F0) and the first four formants (F1, F2, F3, and F4). The formant graph area was calculated. Cephalometric values were obtained using ImageJ. Correlations between acoustic and cephalometric variables in skeletal Class III subjects were then investigated.
RESULTS: Skeletal Class III subjects exhibited significantly higher /o/ F2 and lower /o/ F4 values. Mandibular length, SNB, and overjet of Class III subjects were moderately negatively correlated with acoustic variables.
LIMITATIONS: This study did not take into account vertical skeletal patterns and tissue movements during sound production.
CONCLUSION: Skeletal Class III males produced /o/ (a back, rounded vowel) differently, possibly owing to their anatomical positions or adaptive changes. Vowel production was moderately associated with cephalometric characteristics of Class III subjects. Thus, changes in speech after orthognathic surgery may be expected. A multidisciplinary team approach that includes the input of a speech pathologist would be useful.}, }
@article {pmid35728449, year = {2022}, author = {Kabakoff, H and Gritsyk, O and Harel, D and Tiede, M and Preston, JL and Whalen, DH and McAllister, T}, title = {Characterizing sensorimotor profiles in children with residual speech sound disorder: a pilot study.}, journal = {Journal of communication disorders}, volume = {99}, number = {}, pages = {106230}, pmid = {35728449}, issn = {1873-7994}, support = {F31 DC018197/DC/NIDCD NIH HHS/United States ; R01 DC002717/DC/NIDCD NIH HHS/United States ; R01 DC013668/DC/NIDCD NIH HHS/United States ; R01 DC017476/DC/NIDCD NIH HHS/United States ; }, mesh = {Adolescent ; *Apraxias ; Child ; Humans ; *Language Development Disorders ; Pilot Projects ; Speech ; Speech Production Measurement ; *Speech Sound Disorder/therapy ; *Stuttering ; }, abstract = {PURPOSE: Children with speech errors who have reduced motor skill may be more likely to develop residual errors associated with lifelong challenges. Drawing on models of speech production that highlight the role of somatosensory acuity in updating motor plans, this pilot study explored the relationship between motor skill and speech accuracy, and between somatosensory acuity and motor skill in children. Understanding the connections among sensorimotor measures and speech outcomes may offer insight into how somatosensation and motor skill cooperate during speech production, which could inform treatment decisions for this population.
METHOD: Twenty-five children (ages 9-14) produced syllables in an /ɹ/ stimulability task before and after an ultrasound biofeedback treatment program targeting rhotics. We first tested whether motor skill (as measured by two ultrasound-based metrics of tongue shape complexity) predicted acoustically measured accuracy (the normalized difference between the second and third formant frequencies). We then tested whether somatosensory acuity (as measured by an oral stereognosis task) predicted motor skill, while controlling for auditory acuity.
RESULTS: One measure of tongue shape complexity was a significant predictor of accuracy, such that higher tongue shape complexity was associated with lower accuracy at pre-treatment but higher accuracy at post-treatment. Based on the same measure, children with better somatosensory acuity produced /ɹ/ tongue shapes that were more complex, but this relationship was only present at post-treatment.
CONCLUSION: The predicted relationships among somatosensory acuity, motor skill, and acoustically measured /ɹ/ production accuracy were observed after treatment, but unexpectedly did not hold before treatment. The surprising finding that greater tongue shape complexity was associated with lower accuracy at pre-treatment highlights the importance of evaluating tongue shape patterns (e.g., using ultrasound) prior to treatment, and suggests that children with high tongue shape complexity at pre-treatment may be good candidates for ultrasound-based treatment.}, }
@article {pmid35727115, year = {2022}, author = {González-Alvarez, J and Sos-Peña, R}, title = {Perceiving Body Height From Connected Speech: Higher Fundamental Frequency Is Associated With the Speaker's Height.}, journal = {Perceptual and motor skills}, volume = {129}, number = {5}, pages = {1349-1361}, doi = {10.1177/00315125221110392}, pmid = {35727115}, issn = {1558-688X}, mesh = {Body Height ; Body Size ; Female ; Humans ; Male ; *Speech ; *Speech Perception ; }, abstract = {To a certain degree, human listeners can perceive a speaker's body size from their voice. The speaker's voice pitch or fundamental frequency (Fo) and the vocal formant frequencies are the voice parameters that have been most intensively studied in past body size perception research (particularly for body height). Artificially lowering the Fo of isolated vowels from male speakers improved listeners' accuracy of binary (i.e., tall vs not tall) body height perceptions. This has been explained by the theory that a denser harmonic spectrum provided by a low pitch improved the perceptual resolution of formants that aid formant-based size assessments. In the present study, we extended this research using connected speech (i.e., words and sentences) pronounced by speakers of both sexes. Unexpectedly, we found that raising Fo, not lowering it, increased the participants' perceptual performance in two binary discrimination tasks of body size. We explain our new finding in the temporal domain by the dynamic and time-varying acoustic properties of connected speech. Increased Fo might increase the sampling density of sound wave acoustic cycles and provide more detailed information, such as higher resolution, on the envelope shape.}, }
@article {pmid35712147, year = {2022}, author = {Sugiyama, Y}, title = {Identification of Minimal Pairs of Japanese Pitch Accent in Noise-Vocoded Speech.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {887761}, pmid = {35712147}, issn = {1664-1078}, abstract = {The perception of lexical pitch accent in Japanese was assessed using noise-excited vocoder speech, which contained no fundamental frequency (fo) or its harmonics. While prosodic information such as lexical stress in English and lexical tone in Mandarin Chinese is known to be encoded in multiple acoustic dimensions, such multidimensionality is less understood for lexical pitch accent in Japanese. In the present study, listeners were tested under four different conditions to investigate the contribution of non-fo properties to the perception of Japanese pitch accent: noise-vocoded speech stimuli consisting of 10 3-ERBN-wide bands and 15 2-ERBN-wide bands created from a male and a female speaker. Results showed that listeners were able to identify minimal pairs of final-accented and unaccented words at a rate better than chance in all conditions, indicating the presence of secondary cues to Japanese pitch accent. Subsequent analyses were conducted to investigate whether the listeners' ability to distinguish minimal pairs was correlated with duration, intensity or formant information. These analyses found no strong or consistent correlation, suggesting the possibility that listeners used different cues depending on the information available in the stimuli. Furthermore, the comparison of the current results with equivalent studies in English and Mandarin Chinese suggests that, although lexical prosodic information exists in multiple acoustic dimensions in Japanese, the primary cue is more salient than in other languages.}, }
@article {pmid35700949, year = {2022}, author = {Preisig, BC and Riecke, L and Hervais-Adelman, A}, title = {Speech sound categorization: The contribution of non-auditory and auditory cortical regions.}, journal = {NeuroImage}, volume = {258}, number = {}, pages = {119375}, doi = {10.1016/j.neuroimage.2022.119375}, pmid = {35700949}, issn = {1095-9572}, mesh = {Acoustic Stimulation/methods ; *Auditory Cortex/diagnostic imaging/physiology ; Auditory Perception ; Hearing ; Humans ; Phonetics ; Speech/physiology ; *Speech Perception/physiology ; }, abstract = {Which processes in the human brain lead to the categorical perception of speech sounds? Investigation of this question is hampered by the fact that categorical speech perception is normally confounded by acoustic differences in the stimulus. By using ambiguous sounds, however, it is possible to dissociate acoustic from perceptual stimulus representations. Twenty-seven normally hearing individuals took part in an fMRI study in which they were presented with an ambiguous syllable (intermediate between /da/ and /ga/) in one ear and with a disambiguating acoustic feature (the third formant, F3) in the other ear. Multi-voxel pattern searchlight analysis was used to identify brain areas that consistently differentiated between response patterns associated with different syllable reports. By comparing responses to different stimuli with identical syllable reports and identical stimuli with different syllable reports, we disambiguated whether these regions primarily differentiated the acoustics of the stimuli or the syllable report. We found that BOLD activity patterns in left perisylvian regions (STG, SMG), left inferior frontal regions (vMC, IFG, AI), left supplementary motor cortex (SMA/pre-SMA), and right motor and somatosensory regions (M1/S1) represent listeners' syllable report irrespective of stimulus acoustics. Most of these regions are outside of what is traditionally regarded as auditory or phonological processing areas. Our results indicate that the process of speech sound categorization implicates decision-making mechanisms and auditory-motor transformations.}, }
@article {pmid35694910, year = {2022}, author = {Sayyahi, F and Boulenger, V}, title = {A temporal-based therapy for children with inconsistent phonological disorder: A case-series.}, journal = {Clinical linguistics & phonetics}, volume = {}, number = {}, pages = {1-27}, doi = {10.1080/02699206.2022.2075792}, pmid = {35694910}, issn = {1464-5076}, abstract = {Deficits in temporal auditory processing, and in particular higher gap detection thresholds, have been reported in children with inconsistent phonological disorder (IPD). Here we hypothesized that providing these children with extra time for phoneme identification may in turn enhance their phonological planning abilities for production, and accordingly improve not only consistency but also accuracy of their speech. We designed and tested a new temporal-based therapy, inspired by Core Vocabulary Therapy, which we called T-CVT: we digitally lengthened formant transitions between phonemes of words used for therapy. This allowed us to target both temporal auditory processing and word phonological planning. Four Persian-speaking preschool children with IPD received T-CVT for eight weeks. We measured changes in speech consistency (% inconsistency) and accuracy (percentage of consonants correct, PCC) to assess the effects of the intervention. Therapy significantly improved both consistency and accuracy of word production in the four children: % inconsistency decreased from 59% on average before therapy to 2% post-T-CVT, and PCC increased from 61% to 92% on average. Consistency and accuracy were furthermore maintained or even further improved at the three-month follow-up (2% inconsistency and 99% PCC). Results in a nonword repetition task showed the generalization of these effects to non-treated material: % inconsistency for nonwords decreased from 67% to 10% post-therapy, and PCC increased from 63% to 90%. These preliminary findings support the efficacy of the T-CVT intervention for children with IPD who show temporal auditory processing deficits as reflected by higher gap detection thresholds.}, }
@article {pmid35673798, year = {2022}, author = {Di Dona, G and Scaltritti, M and Sulpizio, S}, title = {Formant-invariant voice and pitch representations are pre-attentively formed from constantly varying speech and non-speech stimuli.}, journal = {The European journal of neuroscience}, volume = {56}, number = {3}, pages = {4086-4106}, pmid = {35673798}, issn = {1460-9568}, mesh = {Acoustic Stimulation/methods ; Attention ; Female ; Humans ; Male ; Reaction Time ; Speech ; *Speech Perception ; }, abstract = {The present study investigated whether listeners can form abstract voice representations while ignoring constantly changing phonological information and if they can use the resulting information to facilitate voice change detection. Further, the study aimed at understanding whether the use of abstraction is restricted to the speech domain or can also be deployed in non-speech contexts. We ran an electroencephalogram (EEG) experiment including one passive and one active oddball task, each featuring a speech and a rotated speech condition. In the speech condition, participants heard constantly changing vowels uttered by a male speaker (standard stimuli) which were infrequently replaced by vowels uttered by a female speaker with higher pitch (deviant stimuli). In the rotated speech condition, participants heard rotated vowels, in which the natural formant structure of speech was disrupted. In the passive task, the mismatch negativity was elicited after the presentation of the deviant voice in both conditions, indicating that listeners could successfully group different stimuli together into a formant-invariant voice representation. In the active task, participants showed shorter reaction times (RTs), higher accuracy and a larger P3b in the speech condition with respect to the rotated speech condition. Results showed that whereas at a pre-attentive level the cognitive system can track pitch regularities while presumably ignoring constantly changing formant information both in speech and in rotated speech, at an attentive level the use of such information is facilitated for speech. This facilitation was also evidenced by stronger synchronisation in the theta band (4-7 Hz), potentially pointing towards differences in encoding/retrieval processes.}, }
@article {pmid35667724, year = {2022}, author = {Hampsey, E and Meszaros, M and Skirrow, C and Strawbridge, R and Taylor, RH and Chok, L and Aarsland, D and Al-Chalabi, A and Chaudhuri, R and Weston, J and Fristed, E and Podlewska, A and Awogbemila, O and Young, AH}, title = {Protocol for Rhapsody: a longitudinal observational study examining the feasibility of speech phenotyping for remote assessment of neurodegenerative and psychiatric disorders.}, journal = {BMJ open}, volume = {12}, number = {6}, pages = {e061193}, pmid = {35667724}, issn = {2044-6055}, mesh = {Feasibility Studies ; Humans ; Longitudinal Studies ; *Mental Disorders ; *Mobile Applications ; Observational Studies as Topic ; Speech ; }, abstract = {INTRODUCTION: Neurodegenerative and psychiatric disorders (NPDs) confer a huge health burden, which is set to increase as populations age. New, remotely delivered diagnostic assessments that can detect early stage NPDs by profiling speech could enable earlier intervention and fewer missed diagnoses. The feasibility of collecting speech data remotely in those with NPDs should be established.
METHODS AND ANALYSIS: The present study will assess the feasibility of obtaining speech data, collected remotely using a smartphone app, from individuals across three NPD cohorts: neurodegenerative cognitive diseases (n=50), other neurodegenerative diseases (n=50) and affective disorders (n=50), in addition to matched controls (n=75). Participants will complete audio-recorded speech tasks and both general and cohort-specific symptom scales. The battery of speech tasks will serve several purposes, such as measuring various elements of executive control (eg, attention and short-term memory), as well as providing measures of voice quality. Participants will then remotely self-administer speech tasks and follow-up symptom scales over a 4-week period. The primary objective is to assess the feasibility of remote collection of continuous narrative speech across a wide range of NPDs using self-administered speech tasks. Additionally, the study evaluates whether acoustic and linguistic patterns can predict diagnostic group, as measured by the sensitivity, specificity, Cohen's kappa and area under the receiver operating characteristic curve of the binary classifiers distinguishing each diagnostic group from each other. Acoustic features analysed include mel-frequency cepstrum coefficients, formant frequencies, intensity and loudness, while text-based features such as number of words, noun and pronoun rate, and idea density will also be used.
ETHICS AND DISSEMINATION: The study received ethical approval from the Health Research Authority and Health and Care Research Wales (REC reference: 21/PR/0070). Results will be disseminated through open access publication in academic journals, relevant conferences and other publicly accessible channels. Results will be made available to participants on request.
TRIAL REGISTRATION NUMBER: NCT04939818.}, }
@article {pmid35664509, year = {2022}, author = {Roessig, S and Winter, B and Mücke, D}, title = {Tracing the Phonetic Space of Prosodic Focus Marking.}, journal = {Frontiers in artificial intelligence}, volume = {5}, number = {}, pages = {842546}, pmid = {35664509}, issn = {2624-8212}, abstract = {Focus is known to be expressed by a wide range of phonetic cues, but only a few studies have explicitly compared different phonetic variables within the same experiment. Therefore, we present results from an analysis of 19 phonetic variables conducted on a German data set that comprises the opposition of unaccented (background) vs. accented (in focus), as well as different focus types with the nuclear accent on the same syllable (broad, narrow, and contrastive focus). The phonetic variables are measures of the acoustic and articulographic signals of a target syllable. Overall, our results provide the highest number of reliable effects and largest effect sizes for accentuation (unaccented vs. accented), while the differentiation of focus types with accented target syllables (broad, narrow, and contrastive focus) is more subtle. The most important phonetic variables across all conditions are measures of the fundamental frequency. The articulatory variables and their corresponding acoustic formants reveal lower tongue positions for both vowels /o, a/, and larger lip openings for the vowel /a/, under increased prosodic prominence, with the strongest effects for accentuation. While duration exhibits consistent mid-ranked results for both accentuation and the differentiation of focus types, measures related to intensity are particularly important for accentuation. Furthermore, voice quality and spectral tilt are affected by accentuation but also in the differentiation of focus types. Our results confirm that focus is realized via multiple phonetic cues. Additionally, the present analysis allows a comparison of the relative importance of different measures to better understand the phonetic space of focus marking.}, }
@article {pmid35664350, year = {2022}, author = {Coughler, C and Quinn de Launay, KL and Purcell, DW and Oram Cardy, J and Beal, DS}, title = {Pediatric Responses to Fundamental and Formant Frequency Altered Auditory Feedback: A Scoping Review.}, journal = {Frontiers in human neuroscience}, volume = {16}, number = {}, pages = {858863}, pmid = {35664350}, issn = {1662-5161}, abstract = {PURPOSE: The ability to hear ourselves speak has been shown to play an important role in the development and maintenance of fluent and coherent speech. Despite this, little is known about the developing speech motor control system throughout childhood, in particular if and how vocal and articulatory control may differ throughout development. A scoping review was undertaken to identify and describe the full range of studies investigating responses to frequency altered auditory feedback in pediatric populations and their contributions to our understanding of the development of auditory feedback control and sensorimotor learning in childhood and adolescence.
METHOD: Relevant studies were identified through a comprehensive search strategy of six academic databases for studies that included (a) real-time perturbation of frequency in auditory input, (b) an analysis of immediate effects on speech, and (c) participants aged 18 years or younger.
RESULTS: Twenty-three articles met inclusion criteria. Across studies, there was a wide variety of designs, outcomes and measures used. Manipulations included fundamental frequency (9 studies), formant frequency (12), frequency centroid of fricatives (1), and both fundamental and formant frequencies (1). Study designs included contrasts across childhood, between children and adults, and between typical, pediatric clinical and adult populations. Measures primarily explored acoustic properties of speech responses (latency, magnitude, and variability). Some studies additionally examined the association of these acoustic responses with clinical measures (e.g., stuttering severity and reading ability), and neural measures using electrophysiology and magnetic resonance imaging.
CONCLUSION: Findings indicated that children above 4 years generally compensated in the opposite direction of the manipulation; however, in several cases they compensated less effectively than adults. Overall, results varied greatly due to the broad range of manipulations and designs used, making generalization challenging. Differences found between age groups in the features of the compensatory vocal responses, latency of responses, vocal variability and perceptual abilities suggest that maturational changes may be occurring in the speech motor control system, affecting the extent to which auditory feedback is used to modify internal sensorimotor representations. Varied findings suggest vocal control develops prior to articulatory control. Future studies with multiple outcome measures, manipulations, and more expansive age ranges are needed to elucidate findings.}, }
@article {pmid35634052, year = {2022}, author = {Wang, X and Wang, T}, title = {Voice Recognition and Evaluation of Vocal Music Based on Neural Network.}, journal = {Computational intelligence and neuroscience}, volume = {2022}, number = {}, pages = {3466987}, pmid = {35634052}, issn = {1687-5273}, mesh = {Humans ; *Music ; Neural Networks, Computer ; Voice Quality ; Voice Recognition ; Voice Training ; }, abstract = {Artistic voice is the artistic life of professional voice users. In the process of selecting and cultivating artistic performing talents, the evaluation of voice occupies a very important position; an appropriate evaluation of the artistic voice is therefore crucial. With the development of art education, scientifically evaluating artistic voice training methods and fairly selecting artistic voice talents are urgent needs for the objective evaluation of the artistic voice. Current evaluation methods for artistic voices are time-consuming, laborious, and highly subjective, and in the objective evaluation of the artistic voice the selection of evaluation acoustic parameters is very important. We attempted to extract the average energy, average frequency error, and average range error of the singing voice with speech analysis technology as objective evaluation parameters, to evaluate the singing quality of the artistic voice objectively using a neural network method, and to compare the outcome with the subjective evaluation of senior professional teachers. In this paper, voice analysis technology is used to extract the first formant, third formant, fundamental frequency, sound range, fundamental frequency perturbation, first formant perturbation, third formant perturbation, and average energy as singing acoustic parameters. Using BP (backpropagation) neural network methods, the quality of singing was evaluated objectively and compared with the subjective evaluation of senior vocal professional teachers. The results show that the BP neural network method, using these evaluation parameters, can accurately and objectively evaluate the quality of the singing voice, which is helpful in scientifically guiding the selection and training of artistic voice talents.}, }
@article {pmid35612119, year = {2022}, author = {Rafi, S and Gangloff, C and Paulhet, E and Grimault, O and Soulat, L and Bouzillé, G and Cuggia, M}, title = {Out-of-Hospital Cardiac Arrest Detection by Machine Learning Based on the Phonetic Characteristics of the Caller's Voice.}, journal = {Studies in health technology and informatics}, volume = {294}, number = {}, pages = {445-449}, doi = {10.3233/SHTI220498}, pmid = {35612119}, issn = {1879-8365}, mesh = {*Cardiopulmonary Resuscitation ; Emergency Medical Service Communication Systems ; *Emergency Medical Services ; Humans ; Machine Learning ; *Out-of-Hospital Cardiac Arrest/diagnosis ; Phonetics ; }, abstract = {INTRODUCTION: Out-of-hospital cardiac arrest (OHCA) is a major public health issue. The prognosis is closely related to the time from collapse to return of spontaneous circulation. Resuscitation efforts are frequently initiated at the request of emergency call center professionals who are specifically trained to identify critical conditions over the phone. However, 25% of OHCAs are not recognized during the first call. Therefore, it would be interesting to develop automated computer systems to recognize OHCA on the phone. The aim of this study was to build and evaluate machine learning models for OHCA recognition based on the phonetic characteristics of the caller's voice.
METHODS: All patients for whom a call was made to the emergency call center of Rennes, France, between 01/01/2017 and 01/01/2019 were eligible. The predicted variable was OHCA presence. Predictor variables were collected by computer-automated phonetic analysis of the call. They were based on the following voice parameters: fundamental frequency, formants, intensity, jitter, shimmer, harmonic to noise ratio, number of voice breaks, and number of periods. Three models were generated using binary logistic regression, random forest, and neural network. The area under the curve (AUC) was the primary outcome used to evaluate each model's performance.
RESULTS: 820 patients were included in the study. The best model to predict OHCA was random forest (AUC=74.9, 95% CI=67.4-82.4).
CONCLUSION: Machine learning models based on the acoustic characteristics of the caller's voice can recognize OHCA. The integration of the acoustic parameters identified in this study will help to design decision-making support systems to improve OHCA detection over the phone.}, }
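As a schematic illustration of the pipeline described above (voice-derived predictors, several classifier families, AUC as the primary outcome), the following scikit-learn sketch trains and scores two of the three model types on synthetic data; the feature matrix, labels, and hyperparameters are placeholders, not the study's data or settings.

    # Schematic model comparison on synthetic "voice parameter" data; the
    # eight columns stand in for f0, formants, intensity, jitter, shimmer,
    # HNR, number of voice breaks, and number of periods (as in the abstract).
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split

    rng = np.random.default_rng(0)
    X = rng.normal(size=(820, 8))        # 820 calls, as in the study
    y = rng.integers(0, 2, size=820)     # 1 = OHCA present (synthetic labels)

    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
    for name, model in [("logistic regression", LogisticRegression(max_iter=1000)),
                        ("random forest", RandomForestClassifier(random_state=0))]:
        proba = model.fit(X_tr, y_tr).predict_proba(X_te)[:, 1]
        print(f"{name}: AUC = {roc_auc_score(y_te, proba):.3f}")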
@article {pmid35548492, year = {2022}, author = {Tomaschek, F and Ramscar, M}, title = {Understanding the Phonetic Characteristics of Speech Under Uncertainty-Implications of the Representation of Linguistic Knowledge in Learning and Processing.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {754395}, pmid = {35548492}, issn = {1664-1078}, abstract = {The uncertainty associated with paradigmatic families has been shown to correlate with their phonetic characteristics in speech, suggesting that representations of complex sublexical relations between words are part of speaker knowledge. To better understand this, recent studies have used two-layer neural network models to examine the way paradigmatic uncertainty emerges in learning. However, to date this work has largely ignored the way choices about the representation of inflectional and grammatical functions (IFS) in models strongly influence what they subsequently learn. To explore the consequences of this, we investigate how representations of IFS in the input-output structures of learning models affect the capacity of uncertainty estimates derived from them to account for phonetic variability in speech. Specifically, we examine whether IFS are best represented as outputs to neural networks (as in previous studies) or as inputs by building models that embody both choices and examining their capacity to account for uncertainty effects in the formant trajectories of word final [ɐ], which in German discriminates around sixty different IFS. Overall, we find that formants are enhanced as the uncertainty associated with IFS decreases. This result dovetails with a growing number of studies of morphological and inflectional families that have shown that enhancement is associated with lower uncertainty in context. Importantly, we also find that in models where IFS serve as inputs-as our theoretical analysis suggests they ought to-their uncertainty measures provide better fits to the empirical variance observed in [ɐ] formants than models where IFS serve as outputs. This supports our suggestion that IFS serve as cognitive cues during speech production, and should be treated as such in modeling. It is also consistent with the idea that when IFS serve as inputs to a learning network, the distinction is maintained between those parts of the network that represent the message and those that represent the signal. We conclude by describing how maintaining a "signal-message-uncertainty distinction" can allow us to reconcile a range of apparently contradictory findings about the relationship between articulation and uncertainty in context.}, }
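The two-layer networks discussed above learn cue-outcome associations with error-driven updates, and the abstract's central question is whether IFS belong on the input (cue) side or the output (outcome) side. A toy Rescorla-Wagner-style sketch of that architectural contrast follows; the cues, outcomes, and learning rate are invented for illustration and do not reproduce the study's models.

    # Toy delta-rule (Rescorla-Wagner-style) learner contrasting IFS-as-input
    # with IFS-as-output architectures. All cue/outcome sets are invented.
    import numpy as np

    def train_delta_rule(n_cues, n_outcomes, events, rate=0.01, epochs=200):
        """events: list of (cue_indices, outcome_indices) co-occurrences."""
        W = np.zeros((n_cues, n_outcomes))
        for _ in range(epochs):
            for cues, outcomes in events:
                idx = list(cues)
                target = np.zeros(n_outcomes)
                target[list(outcomes)] = 1.0
                prediction = W[idx].sum(axis=0)
                W[idx] += rate * (target - prediction)   # delta-rule update
        return W

    # IFS as inputs: grammatical-function cues (0 = dative, 1 = plural)
    # predict a word form (outcome 0 = word-final [-a] variant).
    W_input = train_delta_rule(2, 1, [((0, 1), (0,))])
    # IFS as outputs: the word form (cue 0) predicts its two functions.
    W_output = train_delta_rule(1, 2, [((0,), (0, 1))])
    print(W_input, W_output, sep="\n")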
@article {pmid35529579, year = {2022}, author = {Haiduk, F and Fitch, WT}, title = {Understanding Design Features of Music and Language: The Choric/Dialogic Distinction.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {786899}, pmid = {35529579}, issn = {1664-1078}, abstract = {Music and spoken language share certain characteristics: both consist of sequences of acoustic elements that are combinatorically combined, and these elements partition the same continuous acoustic dimensions (frequency, formant space and duration). However, the resulting categories differ sharply: scale tones and note durations of small integer ratios appear in music, while speech uses phonemes, lexical tone, and non-isochronous durations. Why did music and language diverge into the two systems we have today, differing in these specific features? We propose a framework based on information theory and a reverse-engineering perspective, suggesting that design features of music and language are a response to their differential deployment along three different continuous dimensions. These include the familiar propositional-aesthetic ('goal') and repetitive-novel ('novelty') dimensions, and a dialogic-choric ('interactivity') dimension that is our focus here. Specifically, we hypothesize that music exhibits specializations enhancing coherent production by several individuals concurrently-the 'choric' context. In contrast, language is specialized for exchange in tightly coordinated turn-taking-'dialogic' contexts. We examine the evidence for our framework, both from humans and non-human animals, and conclude that many proposed design features of music and language follow naturally from their use in distinct dialogic and choric communicative contexts. Furthermore, the hybrid nature of intermediate systems like poetry, chant, or solo lament follows from their deployment in the less typical interactive context.}, }
@article {pmid35520977, year = {2021}, author = {Hall, A and Kawai, K and Graber, K and Spencer, G and Roussin, C and Weinstock, P and Volk, MS}, title = {Acoustic analysis of surgeons' voices to assess change in the stress response during surgical in situ simulation.}, journal = {BMJ simulation & technology enhanced learning}, volume = {7}, number = {6}, pages = {471-477}, pmid = {35520977}, issn = {2056-6697}, abstract = {INTRODUCTION: Stress may serve as an adjunct (challenge) or hindrance (threat) to the learning process. Determining the effect of an individual's response to situational demands in either a real or simulated situation may enable optimisation of the learning environment. Studies of acoustic analysis suggest that mean fundamental frequency and formant frequencies of voice vary with an individual's response during stressful events. This hypothesis is reviewed within the otolaryngology (ORL) simulation environment to assess whether acoustic analysis could be used as a tool to determine participants' stress response and cognitive load in medical simulation. Such an assessment could lead to optimisation of the learning environment.
METHODOLOGY: ORL simulation scenarios were performed to teach the participants teamwork and refine clinical skills. Each was performed in an actual operating room (OR) environment (in situ) with a multidisciplinary team consisting of ORL surgeons, OR nurses and anaesthesiologists. Ten of the scenarios were led by an ORL attending and ten were led by an ORL fellow. The vocal communication of each of the 20 individual leaders was analysed using long-term pitch analysis in the PRAAT software (autocorrelation method) to obtain the mean fundamental frequency (F0) and the first four formant frequencies (F1, F2, F3 and F4). In reviewing individual scenarios, each leader's voice was analysed during a non-stressful environment (WHO sign-out procedure) and compared with their voice during a stressful portion of the scenario (responding to deteriorating oxygen saturations in the manikin).
RESULTS: The mean unstressed F0 for the male voice was 161.4 Hz and for the female voice was 217.9 Hz. The mean fundamental frequency of speech in the ORL fellow (lead surgeon) group increased by 34.5 Hz between the scenario's baseline and stressful portions. This was significantly different to the mean change of -0.5 Hz noted in the attending group (p=0.01). No changes were seen in F1, F2, F3 or F4.
CONCLUSIONS: This study demonstrates a method of acoustic analysis of the voices of participants taking part in medical simulations. It suggests acoustic analysis of participants may offer a simple, non-invasive, non-intrusive adjunct in evaluating and titrating the stress response during simulation.}, }
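The core comparison above (each leader's mean F0 in a calm segment versus a stressful segment of the same recording) can be sketched with praat-parselmouth, which exposes PRAAT's autocorrelation-based pitch analysis. The file name and segment boundaries below are placeholders.

    # Sketch: mean F0 of a baseline segment vs. a stressful segment, using
    # praat-parselmouth. File name and segment times are placeholders.
    import numpy as np
    import parselmouth

    def mean_f0(sound, t_start, t_end):
        part = sound.extract_part(from_time=t_start, to_time=t_end)
        pitch = part.to_pitch()                  # autocorrelation-based pitch
        f0 = pitch.selected_array['frequency']
        return float(np.mean(f0[f0 > 0]))        # ignore unvoiced frames

    # Usage (hypothetical recording and timings):
    # snd = parselmouth.Sound("leader_scenario.wav")
    # baseline = mean_f0(snd, 0.0, 30.0)         # e.g., WHO sign-out
    # stressed = mean_f0(snd, 120.0, 150.0)      # e.g., desaturation event
    # print(f"delta F0 = {stressed - baseline:.1f} Hz")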
@article {pmid35497112, year = {2022}, author = {Jarollahi, F and Valadbeigi, A and Jalaei, B and Maarefvand, M and Motasaddi Zarandy, M and Haghani, H and Shirzhiyzn, Z}, title = {Comparing Sound-Field Speech-Auditory Brainstem Response Components between Cochlear Implant Users with Different Speech Recognition in Noise Scores.}, journal = {Iranian journal of child neurology}, volume = {16}, number = {2}, pages = {93-105}, pmid = {35497112}, issn = {1735-4668}, abstract = {OBJECTIVES: Many studies have suggested that cochlear implant (CI) users vary in terms of speech recognition in noise. Studies in this field attribute this variety partly to subcortical auditory processing. Studying speech-Auditory Brainstem Response (speech-ABR) provides good information about speech processing; thus, this work was designed to compare speech-ABR components between two groups of CI users with good and poor speech recognition in noise scores.
MATERIALS & METHODS: The present study was conducted on two groups of CI users aged 8-10 years old. The first group (CI-good) consisted of 15 children with prelingual CI who had good speech recognition in noise performance. The second group (CI-poor) was matched with the first group, but they had poor speech recognition in noise performance. The speech-ABR test in a sound-field presentation was performed for all the participants.
RESULTS: The speech-ABR responses showed longer C, D, E, F, and O latencies in CI-poor than in CI-good users (P < 0.05), while no significant difference was observed in the initial waves V (t = -0.293, P = 0.771) and A (t = -1.051, P = 0.307). Analysis in the spectral domain showed a weaker representation of the fundamental frequency, as well as of the first formant and the high-frequency component of the speech stimuli, in the CI users with poor auditory performance.
CONCLUSIONS: Results revealed that CI users with poor speech recognition in noise had deficits in encoding the periodic portion of speech signals at the brainstem level. This study may also serve as physiological evidence for poorer pitch processing in CI users with poor speech recognition in noise.}, }
@article {pmid35452247, year = {2022}, author = {Houle, N and Goudelias, D and Lerario, MP and Levi, SV}, title = {Effect of Anchor Term on Auditory-Perceptual Ratings of Feminine and Masculine Speakers.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {6}, pages = {2064-2080}, pmid = {35452247}, issn = {1558-9102}, support = {T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Auditory Perception ; Cues ; Female ; Humans ; Male ; Speech ; *Speech Acoustics ; *Speech Perception ; }, abstract = {BACKGROUND: Studies investigating auditory perception of gender expression vary greatly in the specific terms applied to gender expression in rating scales.
PURPOSE: This study examined the effects of different anchor terms on listeners' auditory perceptions of gender expression in phonated and whispered speech. Additionally, token and speaker cues were examined to identify predictors of the auditory-perceptual ratings.
METHOD: Inexperienced listeners (n = 105) completed an online rating study in which they were asked to use one of five visual analog scales (VASs) to rate cis men, cis women, and transfeminine speakers in both phonated and whispered speech. The VASs varied by anchor term (very female/very male, feminine/masculine, feminine female/masculine male, very feminine/not at all feminine, and not at all masculine/very masculine).
RESULTS: Linear mixed-effects models revealed significant two-way interactions of gender expression by anchor term and gender expression by condition. In general, the feminine female/masculine male scale resulted in the most extreme ratings (closest to the end points), and the feminine/masculine scale resulted in the most central ratings. As expected, for all speakers, whispered speech was rated more centrally than phonated speech. Additionally, ratings of phonated speech were predicted by mean fundamental frequency (fo) within each speaker group and by smoothed cepstral peak prominence in cisgender speakers. In contrast, ratings of whispered speech, which lacks an fo, were predicted by indicators of vocal tract resonance (second formant and speaker height).
CONCLUSIONS: The current results indicate that differences in the terms applied to rating scales limit generalization of results across studies. Identifying the patterns across listener ratings of gender expression provides a rationale for researchers and clinicians when making choices about terms. Additionally, beyond fo and vocal tract resonance, predictors of listener ratings vary based on the anchor terms used to describe gender expression.
SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.19617564.}, }
@article {pmid35418360, year = {2022}, author = {Kırbac, A and Turkyılmaz, MD and Yağcıoglu, S}, title = {Gender Effects on Binaural Speech Auditory Brainstem Response.}, journal = {The journal of international advanced otology}, volume = {18}, number = {2}, pages = {125-130}, pmid = {35418360}, issn = {2148-3817}, mesh = {Acoustic Stimulation ; Adult ; Brain Stem/physiology ; *Evoked Potentials, Auditory, Brain Stem/physiology ; Female ; Humans ; Male ; Speech/physiology ; *Speech Perception/physiology ; }, abstract = {BACKGROUND: The speech auditory brainstem response is a tool that provides direct information on how speech sound is temporally and spectrally coded by the auditory brainstem. Speech auditory brainstem response is influenced by many variables, but the effect of gender is unclear, particularly in the binaural recording. Studies on speech auditory brainstem response evoked by binaural stimulation are limited, but gender studies are even more limited and contradictory. This study aimed at examining the effect of gender on speech auditory brainstem response in adults.
METHODS: Time- and frequency-domain analyses of speech auditory brainstem response recordings of 30 healthy participants (15 women and 15 men) aged 18-35 years with normal hearing and no musical education were obtained. For each adult, speech auditory brainstem response was recorded with the syllable /da/ presented binaurally. Peaks of time (V, A, C, D, E, F, and O) and frequency (fundamental frequency, first formant frequency, and high frequency) domains of speech auditory brainstem response were compared between men and women.
RESULTS: The V, A, and F peak latencies of women were significantly shorter than those of men (P < .05). However, no difference was found between women and men in the peak amplitudes of either the time domain (P > .05) or the frequency domain (P > .05).
CONCLUSION: Gender differences in binaural speech auditory brainstem response are significant in adults, particularly in the time domain. When speech stimuli are used for auditory brainstem responses, normative data specific to gender are required. Preliminary normative data from this study could serve as a reference for future studies on binaural speech auditory brainstem response among Turkish adults.}, }
@article {pmid35416268, year = {2022}, author = {Yasar, OC and Ozturk, S and Kemal, O and Kocabicak, E}, title = {Effects of Subthalamic Nucleus Deep Brain Stimulation Surgery on Voice and Formant Frequencies of Vowels in Turkish.}, journal = {Turkish neurosurgery}, volume = {32}, number = {5}, pages = {764-772}, doi = {10.5137/1019-5149.JTN.36134-21.2}, pmid = {35416268}, issn = {1019-5149}, mesh = {*Deep Brain Stimulation/methods ; Humans ; Language ; *Parkinson Disease/surgery ; *Subthalamic Nucleus/physiology/surgery ; }, abstract = {AIM: To investigate the effects of deep brain stimulation (DBS) of the subthalamic nucleus (STN) on acoustic characteristics of voice production in Turkish patients with Parkinson's disease (PD).
MATERIAL AND METHODS: This study recruited 20 patients diagnosed with PD. Voice samples were recorded under the "stimulation on" and "stimulation off" conditions of STN-DBS. Acoustic recordings of the patients were made during the production of vowels /a/, /o/, and /i/ and repetition of the syllables /pa/-/ta/-/ka/. Acoustic analyses were performed using Praat.
RESULTS: A significant difference in the parameters was observed among groups for the vowels. For frequency perturbation (jitter) and noise-to-harmonics ratio, a significant positive difference was observed between the preoperative med-on and postoperative med-on/stim-on groups for /a/, and between the postoperative med-on/stim-on and postoperative med-on/stim-off groups for /o/ and /i/. No significant difference was noted between the preoperative med-on and postoperative med-on/stim-off groups for any vowel.
CONCLUSION: STN-DBS surgery has an acute positive effect on voice. Future studies of formant frequency analysis in STN-DBS may be expanded with both articulation and intelligibility tests, to characterize patient abilities from multiple perspectives and to obtain more precise results.}, }
@article {pmid35400757, year = {2022}, author = {Whalen, DH and DiCanio, C and Dockum, R}, title = {Phonetic Documentation in Three Collections: Topics and Evolution.}, journal = {Journal of the International Phonetic Association}, volume = {52}, number = {1}, pages = {95-121}, pmid = {35400757}, issn = {0025-1003}, support = {R01 DC002717/DC/NIDCD NIH HHS/United States ; }, abstract = {Phonetic aspects of many languages have been documented, though the breadth and focus of such documentation varies substantially. In this survey, phonetic aspects (here called "categories") that are typically reported were assessed in three English-language collections-the Illustrations of the IPA, articles from the Journal of Phonetics, and papers from the Ladefoged/Maddieson Sounds of the World's Languages (SOWL) documentation project. Categories were defined for consonants (e.g., Voice Onset Time (VOT) and frication spectrum; 10 in total), vowels (e.g., formants and duration; 7 total) and suprasegmentals (e.g., stress and distinctive vowel length, 6 total). The Illustrations, due to their brevity, had, on average, limited coverage of the selected categories (12% of the 23 categories). Journal of Phonetics articles were typically theoretically motivated, but 64 had sufficient measurements to count as phonetic documentation; these also covered 12% of the categories. The SOWL studies, designed to cover as much of the phonetic structure as feasible in an article-length treatment, achieved 41% coverage on average. Four book-length studies were also examined, with an average of 49% coverage. Phonetic properties of many language families have been studied, though Indo-European is still disproportionately represented. Physiological measures were excluded as being less common, and perceptual measures were excluded as being typically more theoretical. This preliminary study indicates that certain acoustic properties of languages are typically measured and may be considered as an impetus for later, fuller coverage, but broader consensus on the categories is needed. Current documentation efforts could be more useful if these considerations were addressed.}, }
@article {pmid35394801, year = {2022}, author = {Dahl, KL and François, FA and Buckley, DP and Stepp, CE}, title = {Voice and Speech Changes in Transmasculine Individuals Following Circumlaryngeal Massage and Laryngeal Reposturing.}, journal = {American journal of speech-language pathology}, volume = {31}, number = {3}, pages = {1368-1382}, pmid = {35394801}, issn = {1558-9110}, support = {R01 DC015570/DC/NIDCD NIH HHS/United States ; R01 DC020061/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Humans ; Male ; Massage ; Speech ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {PURPOSE: The purpose of this study was to measure the short-term effects of circumlaryngeal massage and laryngeal reposturing on acoustic and perceptual characteristics of voice in transmasculine individuals.
METHOD: Fifteen transmasculine individuals underwent one session of sequential circumlaryngeal massage and laryngeal reposturing with a speech-language pathologist. Voice recordings were collected at three time points-baseline, postmassage, and postreposturing. Fundamental frequency (fo), formant frequencies, and relative fundamental frequency (RFF; an acoustic correlate of laryngeal tension) were measured. Estimates of vocal tract length (VTL) were derived from formant frequencies. Twelve listeners rated the perceived masculinity of participants' voices at each time point. Repeated-measures analyses of variance measured the effect of time point on fo, estimated VTL, RFF, and perceived voice masculinity. Significant effects were evaluated with post hoc Tukey's tests.
RESULTS: Between baseline and end of the session, fo decreased, VTL increased, and participant voices were perceived as more masculine, all with statistically significant differences. RFF did not differ significantly at any time point. Outcomes were highly variable at the individual level.
CONCLUSION: Circumlaryngeal massage and laryngeal reposturing have short-term effects on select acoustic (fo, estimated VTL) and perceptual characteristics (listener-assigned voice masculinity) of voice in transmasculine individuals.
SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.19529299.}, }
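Estimating vocal tract length (VTL) from formant frequencies, as above, is commonly done with a uniform-tube model in which the nth resonance of a tube of length L closed at one end is Fn = (2n - 1)c/4L. Whether this exact estimator was used in the study is not stated, so the following is a generic illustration.

    # Sketch: VTL estimation under the uniform-tube model
    # F_n = (2n - 1) * c / (4 * L), averaged over the measured formants.
    C = 35000.0  # approximate speed of sound in warm, moist air (cm/s)

    def estimate_vtl(formants_hz):
        estimates = [(2 * n - 1) * C / (4.0 * f)
                     for n, f in enumerate(formants_hz, start=1)]
        return sum(estimates) / len(estimates)

    # Example: F1-F4 of a neutral vowel from a ~17.5 cm vocal tract.
    print(f"VTL ~ {estimate_vtl([500, 1500, 2500, 3500]):.1f} cm")  # -> 17.5 cm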
@article {pmid35377739, year = {2022}, author = {Swann, Z and Daliri, A and Honeycutt, CF}, title = {Impact of Startling Acoustic Stimuli on Word Repetition in Individuals With Aphasia and Apraxia of Speech Following Stroke.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {5}, pages = {1671-1685}, doi = {10.1044/2022_JSLHR-21-00486}, pmid = {35377739}, issn = {1558-9102}, mesh = {Acoustics ; *Aphasia/etiology ; *Apraxias/etiology ; Humans ; Reflex, Startle/physiology ; Speech Intelligibility ; *Stroke/complications ; }, abstract = {PURPOSE: The StartReact effect, whereby movements are elicited by loud, startling acoustic stimuli (SAS), allows the evaluation of movements when initiated through involuntary circuitry, before auditory feedback. When StartReact is applied during poststroke upper extremity movements, individuals exhibit increased muscle recruitment, reaction times, and reaching distances. StartReact releases unimpaired speech with similar increases in muscle recruitment and reaction time. However, as poststroke communication disorders have divergent neural circuitry from upper extremity tasks, it is unclear if StartReact will enhance speech poststroke. Our objective is to determine if (a) StartReact is present in individuals with poststroke aphasia and apraxia and (b) SAS exposure enhances speech intelligibility.
METHOD: We remotely delivered startling, 105-dB white noise bursts (SAS) and quiet, non-SAS cues to 15 individuals with poststroke aphasia and apraxia during repetition of six words. We evaluated average word intensity, pitch, pitch trajectories, the first and second vowel formants (F1 and F2), phonemic error rate, and the percent incidence of each phoneme produced under each cue type (SAS versus non-SAS).
RESULTS: For SAS trials compared to non-SAS, speech intensity increased (∆ + 0.6 dB), speech pitch increased (∆ + 22.7 Hz), and formants (F1 and F2) changed, resulting in a smaller vowel space after SAS. SAS affected pitch trajectories for some, but not all, words. Non-SAS trials had more stops (∆ + 4.7 utterances) while SAS trials had more sustained phonemes (fricatives, glides, affricates, liquids; ∆ + 5.4 utterances). SAS trials had fewer distortion errors but no change in substitution errors or overall error rate compared to non-SAS trials.
CONCLUSIONS: We show that stroke-impaired speech is susceptible to StartReact, evidenced by decreased intelligibility due to altered formants, pitch trajectories, and articulation, including increased incidence of sounds that could not be produced without SAS. Future studies should examine the impact of SAS on voluntary speech intelligibility and clinical measures of aphasia and apraxia.}, }
@article {pmid35377182, year = {2022}, author = {Zhang, G and Shao, J and Zhang, C and Wang, L}, title = {The Perception of Lexical Tone and Intonation in Whispered Speech by Mandarin-Speaking Congenital Amusics.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {4}, pages = {1331-1348}, doi = {10.1044/2021_JSLHR-21-00345}, pmid = {35377182}, issn = {1558-9102}, mesh = {*Auditory Perceptual Disorders ; Humans ; Pitch Perception ; Recognition, Psychology ; Speech ; *Speech Perception ; }, abstract = {PURPOSE: A fundamental feature of human speech is variation, including the manner of phonation, as exemplified in the case of whispered speech. In this study, we employed whispered speech to examine an unresolved issue about congenital amusia, a neurodevelopmental disorder of musical pitch processing, which also affects speech pitch processing such as lexical tone and intonation perception. The controversy concerns whether amusia is a pitch-processing disorder or can affect speech processing beyond pitch.
METHOD: We examined lexical tone and intonation recognition in 19 Mandarin-speaking amusics and 19 matched controls in phonated and whispered speech, where fundamental frequency (fo) information is either present or absent.
RESULTS: The results revealed that the performance of congenital amusics was inferior to that of controls in lexical tone identification in both phonated and whispered speech. These impairments were also detected in identifying intonation (statements/questions) in phonated and whispered modes. Across the experiments, regression models revealed that fo and non-fo (duration, intensity, and formant frequency) acoustic cues predicted tone and intonation recognition in phonated speech, whereas non-fo cues predicted tone and intonation recognition in whispered speech. There were significant differences between amusics and controls in the use of both fo and non-fo cues.
CONCLUSION: The results provided the first evidence that the impairments of amusics in lexical tone and intonation identification prevail into whispered speech and support the hypothesis that the deficits of amusia extend beyond pitch processing.
SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.19302275.}, }
@article {pmid35363414, year = {2022}, author = {Carl, M and Levy, ES and Icht, M}, title = {Speech treatment for Hebrew-speaking adolescents and young adults with developmental dysarthria: A comparison of mSIT and Beatalk.}, journal = {International journal of language & communication disorders}, volume = {57}, number = {3}, pages = {660-679}, doi = {10.1111/1460-6984.12715}, pmid = {35363414}, issn = {1460-6984}, mesh = {Acoustics ; Adolescent ; *Dysarthria/etiology/therapy ; Humans ; Language ; Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; Young Adult ; }, abstract = {BACKGROUND: Individuals with developmental dysarthria typically demonstrate reduced functioning of one or more of the speech subsystems, which negatively impacts speech intelligibility and communication within social contexts. A few treatment approaches are available for improving speech production and intelligibility among individuals with developmental dysarthria. However, these approaches have only limited application and research findings among adolescents and young adults.
AIMS: To determine and compare the effectiveness of two treatment approaches, the modified Speech Intelligibility Treatment (mSIT) and the Beatalk technique, on speech production and intelligibility among Hebrew-speaking adolescents and young adults with developmental dysarthria.
METHODS & PROCEDURES: Two matched groups of adolescents and young adults with developmental dysarthria participated in the study. Each received one of the two treatments, mSIT or Beatalk, over the course of 9 weeks. Measures of speech intelligibility, articulatory accuracy, voice and vowel acoustics were assessed both pre- and post-treatment.
OUTCOMES & RESULTS: Both the mSIT and Beatalk groups demonstrated gains in at least some of the outcome measures. Participants in the mSIT group exhibited improvement in speech intelligibility and voice measures, while participants in the Beatalk group demonstrated increased articulatory accuracy and gains in voice measures from pre- to post-treatment. Significant increases were noted post-treatment for first formant values for select vowels.
CONCLUSIONS & IMPLICATIONS: Results of this preliminary study are promising for both treatment approaches. The differentiated results indicate that the two approaches have distinct applications to speech intelligibility deficits. The current findings also hold clinical significance for treatment among adolescents and young adults with motor speech disorders, and demonstrate applicability to a language other than English.
WHAT THIS PAPER ADDS: What is already known on the subject Developmental dysarthria (e.g., secondary to cerebral palsy) is a motor speech disorder that negatively impacts speech intelligibility, and thus communication participation. Select treatment approaches are available with the aim of improving speech intelligibility in individuals with developmental dysarthria; however, these approaches are limited in number and have only seldom been applied specifically to adolescents and young adults. What this paper adds to existing knowledge The current study presents preliminary data regarding two treatment approaches, the mSIT and Beatalk technique, administered to Hebrew-speaking adolescents and young adults with developmental dysarthria in a group setting. Results demonstrate the initial effectiveness of the treatment approaches, with different gains noted for each approach across speech and voice domains. What are the potential or actual clinical implications of this work? The findings add to the existing literature on potential treatment approaches aiming to improve speech production and intelligibility among individuals with developmental dysarthria. The presented approaches also show promise for group-based treatments as well as the potential for improvement among adolescents and young adults with motor speech disorders.}, }
@article {pmid35344948, year = {2022}, author = {Ho, GY and Kansy, IK and Klavacs, KA and Leonhard, M and Schneider-Stickler, B}, title = {Effect of FFP2/3 Masks on Voice Range Profile Measurement and Voice Acoustics in Routine Voice Diagnostics.}, journal = {Folia phoniatrica et logopaedica : official organ of the International Association of Logopedics and Phoniatrics (IALP)}, volume = {74}, number = {5}, pages = {335-344}, doi = {10.1159/000524299}, pmid = {35344948}, issn = {1421-9972}, mesh = {*Acoustics ; Adult ; COVID-19 ; COVID-19 Testing ; Female ; Humans ; Male ; *Masks ; Middle Aged ; Pandemics ; Phonation ; Speech Acoustics ; *Voice ; Young Adult ; }, abstract = {INTRODUCTION: Voice diagnostics including voice range profile (VRP) measurement and acoustic voice analysis is essential in laryngology and phoniatrics. Due to the COVID-19 pandemic, wearing of class 2 or 3 filtering face piece (FFP2/3) masks is recommended when high-risk aerosol-generating procedures like singing and speaking are being performed. The goal of this study was to compare VRP parameters measured without and with FFP2/3 masks. Further, formant analyses of sustained vowels, the singer's formant, and readings of standard text samples were performed without/with FFP2/3 masks.
METHODS: Twenty subjects (6 males and 14 females) with an average age of 36 ± 16 years (mean ± SD) were enrolled in this study. Fourteen patients were rated as euphonic/not hoarse and 6 patients as mildly hoarse. All subjects underwent the VRP measurements, vowel, and text recordings without/with FFP2/3 mask using the software DiVAS by XION medical (Berlin, Germany). The voice range of the singing voice, the equivalent of voice extension measure (eVEM), the fundamental frequency (F0), and the sound pressure level (SPL) of soft speaking and shouting were calculated and analyzed. Maximum phonation time (MPT) and jitter-% were included for Dysphonia Severity Index (DSI) measurement. Analyses of the singer's formant were performed. Spectral analyses of the sustained vowels /a:/, /i:/, and /u:/ (first formant = F1, second formant = F2), the intensity of the long-term average spectrum, and the alpha-ratio were calculated using the freeware Praat.
RESULTS: For all subjects, the mean values of the routine voice parameters without/with mask were analyzed: no significant differences were found for singing voice range, eVEM, or the SPL and frequency of soft speaking/shouting, except for a significantly lower mean SPL of shouting with the FFP2/3 mask, particularly in the female subjects (p = 0.002). Results for MPT, jitter, and DSI without/with FFP2/3 mask showed no significant differences. Further mean values analyzed without/with mask were the singer's formant/loud singing ratio, which was lower with the FFP2/3 mask (p = 0.001), and F1 and F2 of /a:/, /i:/, and /u:/, which showed no significant differences except a lower F2 of /i:/ with the FFP2/3 mask (p = 0.005). With the exceptions mentioned, the t test revealed no significant differences for any of the routine parameters tested in the recordings without and with a FFP2/3 mask.
CONCLUSION: It can be concluded that VRP measurements including DSI performed with FFP2/3 masks provide reliable data in clinical routine with respect to voice condition/constitution. Spectral analyses of sustained vowels, text, and the singer's formant will be affected by wearing FFP2/3 masks.}, }
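One of the spectral measures above, the alpha ratio, compares energy in the long-term average spectrum above and below roughly 1 kHz. A minimal numpy sketch under that common definition follows; the exact band edges used in the study are an assumption, as the abstract does not state them.

    # Sketch: alpha ratio (dB) = energy in ~1-5 kHz relative to ~50 Hz-1 kHz.
    import numpy as np

    def alpha_ratio(samples, fs, split_hz=1000.0, low_hz=50.0, high_hz=5000.0):
        power = np.abs(np.fft.rfft(samples)) ** 2
        freqs = np.fft.rfftfreq(len(samples), d=1.0 / fs)
        low = power[(freqs >= low_hz) & (freqs < split_hz)].sum()
        high = power[(freqs >= split_hz) & (freqs <= high_hz)].sum()
        return 10.0 * np.log10(high / low)

    fs = 44100
    t = np.arange(0, 1.0, 1.0 / fs)
    # Synthetic "voice": strong low-frequency partial, weak high partial.
    samples = np.sin(2 * np.pi * 200 * t) + 0.1 * np.sin(2 * np.pi * 2000 * t)
    print(f"alpha ratio = {alpha_ratio(samples, fs):.1f} dB")   # about -20 dB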
@article {pmid35344807, year = {2022}, author = {Chauvette, L and Fournier, P and Sharp, A}, title = {The frequency-following response to assess the neural representation of spectral speech cues in older adults.}, journal = {Hearing research}, volume = {418}, number = {}, pages = {108486}, doi = {10.1016/j.heares.2022.108486}, pmid = {35344807}, issn = {1878-5891}, mesh = {Acoustic Stimulation/methods ; Aged ; Cues ; *Hearing Loss ; Humans ; Speech ; *Speech Perception/physiology ; }, abstract = {Older adults often present difficulties understanding speech that cannot be explained by age-related changes in sound audibility. Psychoacoustic and electrophysiologic studies have linked these suprathreshold difficulties to age-related deficits in the auditory processing of temporal and spectral sound information. These studies suggest the existence of an age-related temporal processing deficit in the central auditory system, but the existence of such a deficit in the spectral domain remains understudied. The frequency-following response (FFR) is an electrophysiological evoked response that assesses the ability of the neural auditory system to reproduce the spectral and temporal patterns of a sound. The main goal of this short review is to investigate whether the FFR can identify and measure spectral processing deficits in the elderly compared to younger adults (in both cases, without hearing loss or competing noise). Furthermore, we want to determine what stimuli and analyses have been used in the literature to assess the neural encoding of spectral cues in older adults. Almost all reviewed articles showed an age-related decline in the auditory processing of spectral acoustic information. Even when using different speech and non-speech stimuli, studies reported an age-related decline at the fundamental frequency, at the first formant, and at other harmonic components using different metrics, such as the response's amplitude, inter-trial phase coherence, signal-to-response correlation, and signal-to-noise ratio. These results suggest that older adults may present age-related spectral processing difficulties, but further FFR studies are needed to clarify the effect of advancing age on the neural encoding of spectral speech cues. Spectral processing research on aging would benefit from using a broader variety of stimuli and from rigorously controlling for hearing thresholds even in the absence of disabling hearing loss. Advances in the understanding of the effect of age on FFR measures of spectral encoding could lead to the development of new clinical tools, with possible applications in the field of hearing aid fitting.}, }
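Several of the FFR metrics surveyed above reduce to reading the amplitude of the averaged response spectrum at a target frequency (the stimulus F0, the first formant, or another harmonic). A minimal numpy sketch follows; the sampling rate, duration, and frequencies are placeholders.

    # Sketch: spectral amplitude of an averaged FFR at a target frequency.
    import numpy as np

    def spectral_amplitude(ffr, fs, target_hz, half_bw_hz=5.0):
        """Mean FFT magnitude within +/- half_bw_hz of target_hz."""
        spectrum = np.abs(np.fft.rfft(ffr)) / len(ffr)
        freqs = np.fft.rfftfreq(len(ffr), d=1.0 / fs)
        band = (freqs >= target_hz - half_bw_hz) & (freqs <= target_hz + half_bw_hz)
        return float(spectrum[band].mean())

    fs = 16000                                     # placeholder sampling rate
    t = np.arange(0, 0.2, 1.0 / fs)                # 200-ms averaged response
    ffr = np.sin(2 * np.pi * 100 * t)              # toy response, F0 = 100 Hz
    ffr += 0.1 * np.random.default_rng(0).normal(size=t.size)
    print(spectral_amplitude(ffr, fs, target_hz=100.0))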
@article {pmid35310278, year = {2022}, author = {Zaltz, Y and Kishon-Rabin, L}, title = {Difficulties Experienced by Older Listeners in Utilizing Voice Cues for Speaker Discrimination.}, journal = {Frontiers in psychology}, volume = {13}, number = {}, pages = {797422}, pmid = {35310278}, issn = {1664-1078}, abstract = {Human listeners are assumed to apply different strategies to improve speech recognition in background noise. Young listeners with normal hearing (NH), e.g., have been shown to follow the voice of a particular speaker based on the fundamental (F0) and formant frequencies, which are both influenced by the gender, age, and size of the speaker. However, the auditory and cognitive processes that underlie the extraction and discrimination of these voice cues across speakers may be subject to age-related decline. The present study aimed to examine the utilization of F0 and formant cues for voice discrimination (VD) in older adults with hearing expected for their age. Difference limens (DLs) for VD were estimated in 15 healthy older adults (65-78 years old) and 35 young adults (18-35 years old) using only F0 cues, only formant frequency cues, and a combination of F0 + formant frequencies. A three-alternative forced-choice paradigm with an adaptive-tracking threshold-seeking procedure was used. The Wechsler backward digit span test was used as a measure of auditory working memory. The Trail Making Test (TMT) was used to provide cognitive information reflecting a combined effect of processing speed, mental flexibility, and executive control abilities. The results showed that (a) the mean VD thresholds of the older adults were poorer than those of the young adults for all voice cues, although larger variability was observed among the older listeners; (b) both age groups found the formant cues more beneficial for VD, compared to the F0 cues, and the combined (F0 + formant) cues resulted in better thresholds, compared to each cue separately; (c) significant associations were found for the older adults in the combined F0 + formant condition between VD and TMT scores, and between VD and hearing sensitivity, supporting the notion that a decline with age in both top-down and bottom-up mechanisms may hamper the ability of older adults to discriminate between voices. The present findings suggest that older listeners may have difficulty following the voice of a specific speaker and thus in using this as a strategy for listening amid noise. This may contribute to understanding their reported difficulty listening in adverse conditions.}, }
@article {pmid35288014, year = {2022}, author = {Paulino, CEB and Silva, HJD and Gomes, AOC and Silva, JMSD and Cunha, DAD and Coriolano, MDGWS and Lopes, LW and Lira, ZS}, title = {Relationship Between Oropharyngeal Geometry and Vocal Parameters in Subjects With Parkinson's Disease.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.01.020}, pmid = {35288014}, issn = {1873-4588}, abstract = {OBJECTIVE: To verify whether the dimensions of different segments of the oropharyngeal cavity differ in proportion between Parkinson's disease patients and vocally healthy subjects, and to investigate whether these subjects' oropharyngeal geometry measurements are associated with their acoustic measurements of voice.
METHOD: Quantitative, descriptive, cross-sectional, and retrospective study with secondary data, approved by the Human Research Ethics Committee under no. 4.325.029. We used vocal samples and data from the oropharyngeal geometry of 40 subjects - 20 with Parkinson's disease stages I to III and 20 who formed the control group, matched for sex and age. Each group had 10 males and 10 females, mean age of 61 years (±6.0). Formant (F1, F2, and F3) and cepstral measures of the sustained vowel /ε/ were extracted using Praat software and arranged in the database. The data were descriptively analyzed, with statistics generated with R software. The proportion of oropharyngeal geometry measurements was arranged by mean values and coefficients of variation. Pearson's linear correlation test was applied to relate voice parameters to oropharyngeal geometry, considering P < 0.05, and a linear regression test was applied to model F2.
RESULTS: The Parkinson's disease group showed a linear relationship between oral cavity length and F1 in males (P = 0.04) and between glottal area and F2 in females (P = 0.00); linear relationships were established according to age in both groups, and a regression model for F2 was estimated (R² = 0.61). There was no difference between pathological and healthy voices; there was a difference in the proportional relationship of oropharyngeal geometry between the groups.
CONCLUSION: The proportional relationship of oropharyngeal geometry differs between the Parkinson's disease group and the control group, as well as the relationship between oropharyngeal geometry and formant and cepstral values of voice according to the subjects' sex and age.}, }
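The Praat-based formant extraction in the METHOD above can be reproduced with the parselmouth Python bindings for Praat. A minimal sketch, assuming a mono recording of a sustained vowel saved as sustained_vowel.wav (the filename, edge-trim margins, and analysis settings are our assumptions, not the study's):

```python
import numpy as np
import parselmouth  # Praat bindings for Python

def mean_formants(wav_path, f_max=5500.0):
    """Mean F1-F3 (Hz) over the central portion of a sustained vowel."""
    snd = parselmouth.Sound(wav_path)
    formant = snd.to_formant_burg(maximum_formant=f_max)
    # Sample 50 points, skipping 50 ms at each edge to avoid onset/offset
    times = np.linspace(snd.xmin + 0.05, snd.xmax - 0.05, 50)
    means = {}
    for n in (1, 2, 3):
        track = [formant.get_value_at_time(n, t) for t in times]
        means[f"F{n}"] = float(np.nanmean(track))  # NaN = unvoiced frames
    return means

print(mean_formants("sustained_vowel.wav"))
```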
@article {pmid35276418, year = {2022}, author = {Jüchter, C and Beutelmann, R and Klump, GM}, title = {Speech sound discrimination by Mongolian gerbils.}, journal = {Hearing research}, volume = {418}, number = {}, pages = {108472}, doi = {10.1016/j.heares.2022.108472}, pmid = {35276418}, issn = {1878-5891}, mesh = {Animals ; Auditory Perception/physiology ; Gerbillinae ; Humans ; *Phonetics ; Speech/physiology ; *Speech Perception/physiology ; }, abstract = {The present study establishes the Mongolian gerbil (Meriones unguiculatus) as a model for investigating the perception of human speech sounds. We report data on the discrimination of logatomes (CVCs - consonant-vowel-consonant combinations with outer consonants /b/, /d/, /s/ and /t/ and central vowels /a/, /aː/, /ɛ/, /eː/, /ɪ/, /iː/, /ɔ/, /oː/, /ʊ/ and /uː/, VCVs - vowel-consonant-vowel combinations with outer vowels /a/, /ɪ/ and /ʊ/ and central consonants /b/, /d/, /f/, /g/, /k/, /l/, /m/, /n/, /p/, /s/, /t/ and /v/) by gerbils. Four gerbils were trained to perform an oddball target detection paradigm in which they were required to discriminate a deviant CVC or VCV in a sequence of CVC or VCV standards, respectively. The experiments were performed with an ICRA-1 noise masker with speech-like spectral properties, and logatomes of multiple speakers were presented at various signal-to-noise ratios. Response latencies were measured to generate perceptual maps employing multidimensional scaling, which visualize the gerbils' internal maps of the sounds. The dimensions of the perceptual maps were correlated to multiple phonetic features of the speech sounds for evaluating which features of vowels and consonants are most important for the discrimination. The perceptual representation of vowels and consonants in gerbils was similar to that of humans, although gerbils needed higher signal-to-noise ratios for the discrimination of speech sounds than humans. The gerbils' discrimination of vowels depended on differences in the frequencies of the first and second formant determined by tongue height and position. Consonants were discriminated based on differences in combinations of their articulatory features. The similarities in the perception of logatomes by gerbils and humans renders the gerbil a suitable model for human speech sound discrimination.}, }
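The perceptual maps described above are built by multidimensional scaling of response-latency-based dissimilarities. A toy sketch with scikit-learn; the four sound labels and the dissimilarity matrix are invented placeholders:

```python
import numpy as np
from sklearn.manifold import MDS

# Hypothetical symmetric dissimilarity matrix (e.g., derived from response
# latencies: faster detection of a deviant = more dissimilar sounds).
labels = ["/a/", "/i/", "/s/", "/t/"]
dissim = np.array([
    [0.0, 0.6, 0.9, 0.8],
    [0.6, 0.0, 0.8, 0.9],
    [0.9, 0.8, 0.0, 0.4],
    [0.8, 0.9, 0.4, 0.0],
])

# Non-metric MDS on precomputed dissimilarities yields a 2D perceptual map
mds = MDS(n_components=2, dissimilarity="precomputed", metric=False,
          random_state=0)
coords = mds.fit_transform(dissim)
for label, (x, y) in zip(labels, coords):
    print(f"{label}: ({x:+.2f}, {y:+.2f})")
```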
@article {pmid35259200, year = {2022}, author = {Tamura, T and Tanaka, Y and Watanabe, Y and Sato, K}, title = {Relationships between maximum tongue pressure and second formant transition in speakers with different types of dysarthria.}, journal = {PloS one}, volume = {17}, number = {3}, pages = {e0264995}, pmid = {35259200}, issn = {1932-6203}, mesh = {Adult ; Aged ; *Dysarthria ; Female ; Humans ; Male ; Pressure ; *Speech Acoustics ; Speech Intelligibility/physiology ; Speech Production Measurement ; Tongue ; Young Adult ; }, abstract = {The effects of muscle weakness on speech are currently not fully known. We investigated the relationships between maximum tongue pressure and second formant transition in adults with different types of dysarthria. The analysis focused on the slope of the second formant transition because it reflects tongue velocity during articulation. Sixty-three Japanese speakers with dysarthria (median age, 68 years; interquartile range, 58-77 years; 44 men and 19 women) admitted to acute and convalescent hospitals were included. Thirty neurologically normal speakers aged 19-85 years (median age, 22 years; interquartile range, 21.0-23.8 years; 14 men and 16 women) were also included. The relationship between the maximum tongue pressure and speech function was evaluated using correlation analysis in the dysarthria group. Speech intelligibility, the oral diadochokinesis rate, and the second formant slope served as indices of impaired speech. More than half of the speakers had mild to moderate dysarthria. Speakers with dysarthria showed significantly lower maximum tongue pressure, speech intelligibility, oral diadochokinesis rate, and second formant slope than neurologically normal speakers. Only the second formant slope was significantly correlated with the maximum tongue pressure (r = 0.368, p = 0.003). The relationship between the second formant slope and maximum tongue pressure showed a similar correlation in the analysis of subgroups divided by sex. The oral diadochokinesis rate, which is related to the speed of articulation, is affected by voice on/off, mandibular opening/closing, and range of motion. In contrast, the second formant slope was less affected by these factors. These results suggest that the maximum isometric tongue strength is associated with tongue movement speed during articulation.}, }
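The key acoustic measure above, the second-formant slope, is typically obtained by fitting a line to the F2 track over the transition. A minimal sketch; the 60 ms rising transition below is simulated, not the study's data:

```python
import numpy as np
from scipy.stats import linregress

def f2_slope(times_s, f2_hz):
    """Slope of the second-formant transition in Hz/ms."""
    fit = linregress(np.asarray(times_s) * 1000.0, f2_hz)  # x axis in ms
    return fit.slope

# Hypothetical F2 transition rising from 1200 to 1800 Hz over 60 ms
t = np.linspace(0.0, 0.060, 13)
f2 = np.linspace(1200.0, 1800.0, 13)
print(f"{f2_slope(t, f2):.1f} Hz/ms")  # -> 10.0 Hz/ms
```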
@article {pmid35250034, year = {2022}, author = {Georgiou, GP}, title = {Acoustic markers of vowels produced with different types of face masks.}, journal = {Applied acoustics. Acoustique applique. Angewandte Akustik}, volume = {191}, number = {}, pages = {108691}, pmid = {35250034}, issn = {0003-682X}, abstract = {The wide spread of SARS-CoV-2 led to the extensive use of face masks in public places. Although masks offer significant protection from infectious droplets, they also impact verbal communication by altering the speech signal. The present study examines how two types of face masks affect the speech properties of vowels. Twenty speakers were recorded producing their native vowels in a /pVs/ context, maintaining a normal speaking rate. Speakers were asked to produce the vowels in three conditions: (a) with a surgical mask, (b) with a cotton mask, and (c) without a mask. The speakers' output was analyzed through Praat speech acoustics software. We fitted three linear mixed-effects models to investigate the mask-wearing effects on the first formant (F1), second formant (F2), and duration of vowels. The results demonstrated that F1 and duration of vowels remained intact in the masked conditions compared to the unmasked condition, while F2 was altered for three out of five vowels (/e a u/) in the surgical mask and two out of five vowels (/e a/) in the cotton mask. So, both types of masks altered the speech signal to some extent, and they mostly affected the same vowel qualities. It is concluded that some acoustic properties are more sensitive than others to speech signal modification when speech is filtered through masks, and that different sounds are affected in different ways. The findings may have significant implications for second/foreign language instructors who teach pronunciation and for speech therapists who teach sounds to individuals with language disorders.}, }
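The analysis above (one linear mixed-effects model per acoustic measure) maps directly onto statsmodels. A toy sketch with a random intercept per speaker and a fixed effect of mask condition on F2; the speakers, values, and noise are invented:

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Hypothetical long-format data: one row per vowel token
df = pd.DataFrame({
    "speaker":   ["s1", "s1", "s1", "s2", "s2", "s2"] * 4,
    "condition": ["no_mask", "surgical", "cotton"] * 8,
    "f2":        [1850.0, 1790.0, 1810.0, 2010.0, 1950.0, 1975.0] * 4,
})
rng = np.random.default_rng(0)
df["f2"] += rng.normal(0, 15, len(df))  # token-to-token variability

# F2 ~ mask condition, with speaker as the random (grouping) factor
model = smf.mixedlm("f2 ~ condition", df, groups=df["speaker"])
print(model.fit().summary())
```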
@article {pmid35249395, year = {2022}, author = {Bertucci, V and Stevens, K and Sidhu, N and Suri, S and Bressmann, T}, title = {The Impact of Fan-Type Rapid Palatal Expanders on Speech in Patients With Unilateral Cleft Lip and Palate.}, journal = {The Cleft palate-craniofacial journal : official publication of the American Cleft Palate-Craniofacial Association}, volume = {}, number = {}, pages = {10556656221084541}, doi = {10.1177/10556656221084541}, pmid = {35249395}, issn = {1545-1569}, abstract = {Rapid palatal expanders (RPEs) are commonly used in patients with cleft lip and palate (CLP) prior to secondary alveolar bone grafting (SABG). Their position and size can impede tongue movement and affect speech. This study assessed changes in perception and production of speech over the course of RPE treatment. Design: prospective longitudinal. Setting: tertiary university-affiliated hospital. Participants: twenty-five patients with unilateral CLP treated with Fan-type RPEs, and their parents. Patient and parent speech questionnaires and patient speech recordings were collected at baseline before RPE insertion (T1), directly after RPE insertion (T2), during RPE expansion (T3), during RPE retention (T4), directly after RPE removal but before SABG (T5), and at short-term follow-up after RPE removal and SABG (T6). Ratings for patient and parent questionnaires, first (F1) and second (F2) formants for vowels /a/, /i/, and /u/, and nasalance scores for non-nasal and nasal sentences, were obtained and analyzed using mixed model analyses of variance. Ratings worsened at T2. For the vowel /a/, F1 and F2 were unchanged at T2. For the vowel /i/, F1 increased and F2 decreased at T2. For the vowel /u/, F1 was unchanged and F2 decreased at T2. Nasalance was unchanged at T2. All outcome measures returned to T1 levels by T4. RPE insertion resulted in initial adverse effects on speech perception and production, which decreased to baseline prior to removal. Information regarding transient speech dysfunction and distress may help prepare patients for treatment.}, }
@article {pmid35242348, year = {2022}, author = {Anikin, A and Pisanski, K and Reby, D}, title = {Static and dynamic formant scaling conveys body size and aggression.}, journal = {Royal Society open science}, volume = {9}, number = {1}, pages = {211496}, pmid = {35242348}, issn = {2054-5703}, abstract = {When producing intimidating aggressive vocalizations, humans and other animals often extend their vocal tracts to lower their voice resonance frequencies (formants) and thus sound big. Is acoustic size exaggeration more effective when the vocal tract is extended before, or during, the vocalization, and how do listeners interpret within-call changes in apparent vocal tract length? We compared perceptual effects of static and dynamic formant scaling in aggressive human speech and nonverbal vocalizations. Acoustic manipulations corresponded to elongating or shortening the vocal tract either around (Experiment 1) or from (Experiment 2) its resting position. Gradual formant scaling that preserved average frequencies conveyed the impression of smaller size and greater aggression, regardless of the direction of change. Vocal tract shortening from the original length conveyed smaller size and less aggression, whereas vocal tract elongation conveyed larger size and more aggression, and these effects were stronger for static than for dynamic scaling. Listeners familiarized with the speaker's natural voice were less often 'fooled' by formant manipulations when judging speaker size, but paid more attention to formants when judging aggressive intent. Thus, within-call vocal tract scaling conveys emotion, but a better way to sound large and intimidating is to keep the vocal tract consistently extended.}, }
@article {pmid35240298, year = {2022}, author = {Haider, CL and Suess, N and Hauswald, A and Park, H and Weisz, N}, title = {Masking of the mouth area impairs reconstruction of acoustic speech features and higher-level segmentational features in the presence of a distractor speaker.}, journal = {NeuroImage}, volume = {252}, number = {}, pages = {119044}, doi = {10.1016/j.neuroimage.2022.119044}, pmid = {35240298}, issn = {1095-9572}, mesh = {Acoustic Stimulation ; Acoustics ; Humans ; Mouth ; *Speech ; *Speech Perception ; Visual Perception ; }, abstract = {Multisensory integration enables stimulus representation even when the sensory input in a single modality is weak. In the context of speech, when confronted with a degraded acoustic signal, congruent visual inputs promote comprehension. When this input is masked, speech comprehension consequently becomes more difficult. However, it remains unclear which levels of speech processing are affected, and under which circumstances, when the mouth area is occluded. To answer this question, we conducted an audiovisual (AV) multi-speaker experiment using naturalistic speech. In half of the trials, the target speaker wore a (surgical) face mask, while we measured the brain activity of normal hearing participants via magnetoencephalography (MEG). We added a distractor speaker in half of the trials in order to create an ecologically difficult listening situation. A decoding model was trained on the clear AV speech and used to reconstruct crucial speech features in each condition. We found significant main effects of face masks on the reconstruction of acoustic features, such as the speech envelope and spectral speech features (i.e. pitch and formant frequencies), while reconstruction of higher-level features of speech segmentation (phoneme and word onsets) was especially impaired through masks in difficult listening situations. As we used surgical face masks in our study, which only show mild effects on speech acoustics, we interpret our findings as the result of the missing visual input. Our findings extend previous behavioural results by demonstrating the complex contextual effects of occluding relevant visual information on speech processing.}, }
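The "decoding model" above reconstructs stimulus features from brain activity. A toy backward model in the same spirit: time-lagged ridge regression from simulated multichannel data to a speech envelope. Channel count, lags, and data are invented; real MEG pipelines (e.g., mTRF-style analyses) add proper preprocessing and cross-validation:

```python
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
n_t, n_ch, n_lags = 2000, 16, 5

# Simulated target feature (smoothed noise stands in for a speech envelope)
envelope = np.convolve(rng.normal(size=n_t), np.ones(20) / 20, mode="same")
# Simulated sensors: each channel mixes the envelope with sensor noise
meg = envelope[:, None] * rng.normal(0.5, 0.1, n_ch) \
      + rng.normal(0, 1.0, (n_t, n_ch))

# Design matrix stacks all channels at several time lags per sample
X = np.hstack([np.roll(meg, lag, axis=0) for lag in range(n_lags)])[n_lags:]
y = envelope[n_lags:]

model = Ridge(alpha=10.0).fit(X[:1500], y[:1500])
r = np.corrcoef(model.predict(X[1500:]), y[1500:])[0, 1]
print(f"held-out reconstruction r = {r:.2f}")
```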
@article {pmid35232632, year = {2022}, author = {Hoyer, P and Riedler, M and Unterhofer, C and Graf, S}, title = {Vocal Tract and Subglottal Impedance in High Performance Singing: A Case Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2022.01.015}, pmid = {35232632}, issn = {1873-4588}, abstract = {OBJECTIVES/HYPOTHESIS: The respiratory process is important in vocal training, and airflow is highly important in professional singing. It is hypothesized that subglottal resonances are important to the singing voice in high-performance singing.
STUDY DESIGN: Single subject, prospective.
METHOD: A professional soprano singer shaped her vocal tract to form the vowels [a], [e], [i], [o], and [u] at the pitch d4. We measured phonated vowels and the vocal tract impedance spectra with a deterministic noise supplied by an iPhone buzzer in the range of 200 to 4,000 Hz at closed glottis, during exhalation and during inhalation while maintaining the shape of the vocal tract.
RESULTS: Measurements of the phonated vowels before and after the different glottal adjustments were highly reproducible. Vocal tract resonances, and those arising during respiration, are reported. The impedance spectra show vowel-dependent resonances with closed and open glottis. The formants of the vocal spectra are explained by including both the vocal tract and the subglottal resonances.
CONCLUSION: The findings indicate that subglottal resonances influence the first formant as well as the singer's formant cluster in high-performance singing. The instrumental setup used for the impedance measurement allows a simple and lightweight procedure for measuring vocal tract and subglottal resonances.}, }
@article {pmid35232067, year = {2022}, author = {Luberadzka, J and Kayser, H and Hohmann, V}, title = {Making sense of periodicity glimpses in a prediction-update-loop-A computational model of attentive voice tracking.}, journal = {The Journal of the Acoustical Society of America}, volume = {151}, number = {2}, pages = {712}, pmid = {35232067}, issn = {1520-8524}, support = {R01 DC015429/DC/NIDCD NIH HHS/United States ; }, mesh = {Acoustics ; Bayes Theorem ; Computer Simulation ; Humans ; Periodicity ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {Humans are able to follow a speaker even in challenging acoustic conditions. The perceptual mechanisms underlying this ability remain unclear. A computational model of attentive voice tracking, consisting of four computational blocks: (1) sparse periodicity-based auditory features (sPAF) extraction, (2) foreground-background segregation, (3) state estimation, and (4) top-down knowledge, is presented. The model connects the theories about auditory glimpses, foreground-background segregation, and Bayesian inference. It is implemented with the sPAF, sequential Monte Carlo sampling, and probabilistic voice models. The model is evaluated by comparing it with the human data obtained in the study by Woods and McDermott [Curr. Biol. 25(17), 2238-2246 (2015)], which measured the ability to track one of two competing voices with time-varying parameters [fundamental frequency (F0) and formants (F1,F2)]. Three model versions were tested, which differ in the type of information used for the segregation: version (a) uses the oracle F0, version (b) uses the estimated F0, and version (c) uses the spectral shape derived from the estimated F0 and oracle F1 and F2. Version (a) simulates the optimal human performance in conditions with the largest separation between the voices, version (b) simulates the conditions in which the separation is not sufficient to follow the voices, and version (c) is closest to the human performance for moderate voice separation.}, }
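The state-estimation block above uses sequential Monte Carlo sampling. A toy bootstrap particle filter tracking a slowly drifting F0 from noisy per-frame "glimpses"; the Gaussian observation model and all numbers are our simplifications (the paper uses sparse periodicity-based auditory features and probabilistic voice models):

```python
import numpy as np

rng = np.random.default_rng(0)
n_frames, n_particles = 100, 500

# Simulated truth and observations: F0 random walk plus noisy glimpses
true_f0 = 150 + np.cumsum(rng.normal(0, 1.0, n_frames))  # Hz
obs = true_f0 + rng.normal(0, 8.0, n_frames)

particles = rng.uniform(100, 300, n_particles)  # initial F0 hypotheses
estimates = []
for z in obs:
    particles += rng.normal(0, 1.5, n_particles)       # predict (drift)
    w = np.exp(-0.5 * ((z - particles) / 8.0) ** 2)    # observation weights
    w /= w.sum()
    estimates.append(float(np.dot(w, particles)))      # posterior mean
    particles = particles[rng.choice(n_particles, n_particles, p=w)]

err = np.mean(np.abs(np.array(estimates) - true_f0))
print(f"mean absolute tracking error = {err:.2f} Hz")
```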
@article {pmid35232065, year = {2022}, author = {Saba, JN and Hansen, JHL}, title = {The effects of Lombard perturbation on speech intelligibility in noise for normal hearing and cochlear implant listeners.}, journal = {The Journal of the Acoustical Society of America}, volume = {151}, number = {2}, pages = {1007}, pmid = {35232065}, issn = {1520-8524}, support = {R01 DC016839/DC/NIDCD NIH HHS/United States ; }, mesh = {Acoustic Stimulation/methods ; *Cochlear Implants ; Hearing ; Speech Intelligibility ; *Speech Perception ; }, abstract = {Natural compensation of speech production in challenging listening environments is referred to as the Lombard effect (LE). The resulting acoustic differences between neutral and Lombard speech have been shown to provide intelligibility benefits for normal hearing (NH) and cochlear implant (CI) listeners alike. Motivated by this outcome, three LE perturbation approaches consisting of pitch, duration, formant, intensity, and spectral contour modifications were designed specifically for CI listeners to combat speech-in-noise performance deficits. Experiment 1 analyzed the effects of loudness, quality, and distortion of approaches on speech intelligibility with and without formant-shifting. Significant improvements of +9.4% were observed in CI listeners without the formant-shifting approach at +5 dB signal-to-noise ratio (SNR) large-crowd-noise (LCN) when loudness was controlled; however, performance was found to be significantly lower for NH listeners. Experiment 2 evaluated the non-formant-shifting approach with additional spectral contour and high pass filtering to reduce spectral smearing and decrease distortion observed in Experiment 1. This resulted in significant intelligibility benefits of +30.2% for NH and +21.2% for CI listeners at 0 and +5 dB SNR LCN, respectively. These results suggest that LE perturbation may be useful as front-end speech modification approaches to improve intelligibility for CI users in noise.}, }
@article {pmid35180005, year = {2022}, author = {Sen, A and Thakkar, H and Vincent, V and Rai, S and Singh, A and Mohanty, S and Roy, A and Ramakrishnan, L}, title = {Endothelial colony forming cells' tetrahydrobiopterin level in coronary artery disease patients and its association with circulating endothelial progenitor cells.}, journal = {Canadian journal of physiology and pharmacology}, volume = {100}, number = {5}, pages = {473-485}, doi = {10.1139/cjpp-2021-0548}, pmid = {35180005}, issn = {1205-7541}, mesh = {Biopterin/analogs & derivatives ; *Coronary Artery Disease ; *Endothelial Progenitor Cells ; Humans ; }, abstract = {Endothelial colony forming cells (ECFCs) participate in neovascularization. Endothelial nitric oxide synthase (eNOS) derived NO· helps in homing of endothelial progenitor cells (EPCs) at the site of vascular injury. The enzyme cofactor tetrahydrobiopterin (BH4) stabilizes the catalytic active state of eNOS. The association of intracellular ECFC biopterins and the ratio of reduced to oxidized biopterin (BH4:BH2) with circulatory EPCs and ECFC functionality has not been studied. We investigated ECFC biopterin levels and their association with circulatory EPCs as well as ECFC proliferative potential in terms of day of appearance in culture. Circulatory EPCs were enumerated by flow cytometry in 53 coronary artery disease (CAD) patients and 42 controls. ECFCs were cultured, characterized, and biopterin levels assessed by high performance liquid chromatography. The appearance of ECFC colonies and their number were recorded. Circulatory EPCs were significantly lower in CAD, and ECFCs appeared in 56% and 33% of CAD and control subjects, respectively. Intracellular BH4 and BH4:BH2 were significantly reduced in CAD. BH4:BH2 was positively correlated with circulatory EPCs (p = 0.01), and negatively with day of appearance of ECFCs (p = 0.04). Circulatory EPCs negatively correlated with ECFC appearance (p = 0.02). These findings suggest the role of biopterins in maintaining circulatory EPCs and the functional integrity of ECFCs.}, }
@article {pmid35175986, year = {2022}, author = {Lou, Q and Wang, X and Jiang, L and Wang, G and Chen, Y and Liu, Q}, title = {Subjective and Objective Evaluation of Speech in Adult Patients with Unrepaired Cleft Palate.}, journal = {The Journal of craniofacial surgery}, volume = {33}, number = {5}, pages = {e528-e532}, doi = {10.1097/SCS.0000000000008567}, pmid = {35175986}, issn = {1536-3732}, mesh = {Adult ; *Cleft Palate/complications/surgery ; Humans ; Speech ; Speech Disorders/diagnosis/etiology ; Speech Intelligibility ; Speech Production Measurement/methods ; Voice Quality ; Young Adult ; }, abstract = {OBJECTIVE: To explore the speech outcomes of adult patients through subjective perception evaluation and objective acoustic analysis, and to compare the pronunciation characteristics of adult patients with unrepaired cleft palate with those of their non-cleft peers.
PARTICIPANTS AND INTERVENTION: Subjective evaluation indicators included speech intelligibility, nasality, and consonant missing rate, whereas objective acoustic parameters included normalized vowel formants, voice onset time, and the analysis of three-dimensional spectrogram and spectrum. These evaluations were carried out on speech samples produced by 2 groups of speakers: (a) speakers with unrepaired cleft palate (n = 65, mean age = 25.1 years) and (b) typical speakers (n = 30, mean age = 23.7 years).
RESULTS: Compared with typical speakers, individuals with unrepaired cleft palate exhibited lower speech intelligibility with higher nasality and a higher consonant missing rate; the missing rate was highest for the 6 consonant syllables. The acoustic differences were mainly manifested in vowel formants and voice onset time.
CONCLUSIONS: The results revealed important acoustical differences between adult patients with unrepaired cleft palate and typical speakers. The trend of spectral deviation may have contributed to the difficulty in producing pressure vowels and aspirated consonants in individuals with speech disorders related to cleft palate.}, }
@article {pmid35166414, year = {2022}, author = {Nguyen, DD and Chacon, A and Payten, C and Black, R and Sheth, M and McCabe, P and Novakovic, D and Madill, C}, title = {Acoustic characteristics of fricatives, amplitude of formants and clarity of speech produced without and with a medical mask.}, journal = {International journal of language & communication disorders}, volume = {57}, number = {2}, pages = {366-380}, pmid = {35166414}, issn = {1460-6984}, mesh = {Acoustics ; Humans ; Phonetics ; *Speech ; Speech Acoustics ; Speech Disorders ; *Speech Perception ; }, abstract = {BACKGROUND: Previous research has found that high-frequency energy of speech signals decreased while wearing face masks. However, no study has examined the specific spectral characteristics of fricative consonants and vowels and the perception of clarity of speech in mask wearing.
AIMS: To investigate acoustic-phonetic characteristics of fricative consonants and vowels and auditory perceptual rating of clarity of speech produced with and without wearing a face mask.
METHODS & PROCEDURES: A total of 16 healthcare workers read the Rainbow Passage using modal phonation in three conditions: without a face mask, with a standard surgical mask and with a KN95 mask (China GB2626-2006, a medical respirator with higher barrier level than the standard surgical mask). Speech samples were acoustically analysed for root mean square (RMS) amplitude (ARMS) and spectral moments of four fricatives /f/, /s/, /ʃ/ and /z/; and amplitude of the first three formants (A1, A2 and A3) measured from the reading passage and extracted vowels. Auditory perception of speech clarity was performed. Data were compared across mask and non-mask conditions using linear mixed models.
OUTCOMES & RESULTS: The ARMS of all included fricatives was significantly lower in the surgical mask and KN95 mask conditions than in the non-mask condition. The centre of gravity of /f/ decreased in both the surgical and KN95 masks, while other spectral moments did not show systematic significant linear trends across mask conditions. None of the formant amplitude measures was statistically different across conditions. Speech clarity was significantly poorer in both the surgical and KN95 mask conditions.
CONCLUSIONS & IMPLICATIONS: Speech produced while wearing either a surgical mask or a KN95 mask was associated with decreased fricative amplitude and poorer speech clarity.
WHAT THIS PAPER ADDS: What is already known on the subject: Previous studies have shown that the overall spectral levels in high frequency ranges and intelligibility are decreased for speech produced with a face mask. It is unclear how different types of speech signals, that is, fricatives and vowels, are affected in speech produced while wearing either a medical surgical mask or a KN95 mask. It is also unclear whether ratings of speech clarity are similar for speech produced with these face masks. What this paper adds to existing knowledge: Speech data collected in real-world, clinical, non-laboratory-controlled settings showed differences in the amplitude of fricatives and speech clarity ratings between non-mask and mask-wearing conditions. Formant amplitude did not show significant differences in mask-wearing conditions compared with non-mask. What are the potential or actual clinical implications of this work? Wearing a surgical mask or a KN95 mask had different effects on consonants and vowels. It appeared from the findings in this study that these masks only affected fricative consonants and did not affect vowel production. The poorer speech clarity in these mask-wearing conditions has important implications for speech perception in communication between clinical staff and between medical officers and patients in clinics, and between people in everyday situations. The impact of these masks on speech perception may be more pronounced in people with hearing impairment and communication disorders. In voice evaluation and/or therapy sessions, the effects of wearing a medical mask can occur bidirectionally for both the clinician and the patient. The patient may find it more challenging to understand the speech conveyed by the clinician, while the clinician may not perceptually assess the patient's speech and voice accurately. Given the significant correlation between clarity ratings and fricative amplitude, improving fricative signals would be useful to improve speech clarity while wearing these medical face masks.}, }
@article {pmid35142977, year = {2022}, author = {Gábor, A and Kaszás, N and Faragó, T and Pérez Fraga, P and Lovas, M and Andics, A}, title = {The acoustic bases of human voice identity processing in dogs.}, journal = {Animal cognition}, volume = {25}, number = {4}, pages = {905-916}, pmid = {35142977}, issn = {1435-9456}, mesh = {Acoustics ; Animals ; Cues ; Dogs ; Humans ; Recognition, Psychology ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {Speech carries identity-diagnostic acoustic cues that help individuals recognize each other during vocal-social interactions. In humans, fundamental frequency, formant dispersion and harmonics-to-noise ratio serve as characteristics along which speakers can be reliably separated. The ability to infer a speaker's identity is also adaptive for members of other species (like companion animals) for whom humans (as owners) are relevant. The acoustic bases of speaker recognition in non-humans are unknown. Here, we tested whether dogs can recognize their owner's voice and whether they rely on the same acoustic parameters for such recognition as humans use to discriminate speakers. Stimuli were pre-recorded sentences spoken by the owner and control persons, played through loudspeakers placed behind two non-transparent screens (with each screen hiding a person). We investigated the association between acoustic distance of speakers (examined along several dimensions relevant in intraspecific voice identification) and dogs' behavior. Dogs chose their owner's voice more often than those of control persons, suggesting that they can identify it. Choosing success and time spent looking in the direction of the owner's voice were positively associated, showing that looking time is an index of the ease of choice. Acoustic distance of speakers in mean fundamental frequency and jitter were positively associated with looking time, indicating that the shorter the acoustic distance between speakers with regard to these parameters, the harder the decision. So, dogs use these cues to discriminate their owner's voice from unfamiliar voices. These findings reveal that dogs use some but probably not all acoustic parameters that humans use to identify speakers. Although dogs can detect fine changes in speech, their perceptual system may not be fully attuned to identity-diagnostic cues in the human voice.}, }
@article {pmid35141903, year = {2022}, author = {V, K and S, SP}, title = {Hybrid machine learning classification scheme for speaker identification.}, journal = {Journal of forensic sciences}, volume = {67}, number = {3}, pages = {1033-1048}, doi = {10.1111/1556-4029.15006}, pmid = {35141903}, issn = {1556-4029}, mesh = {*Machine Learning ; Speech ; *Support Vector Machine ; }, abstract = {Motivated by the requirement to prepare for the next generation of "Automatic Spokesperson Recognition" (ASR) system, this paper applied the fused spectral features with hybrid machine learning (ML) strategy to the speech communication field. This strategy involved the combined spectral features such as mel-frequency cepstral coefficients (MFCCs), spectral kurtosis, spectral skewness, normalized pitch frequency (NPF), and formants. The characterization of suggested classification method could possibly serve in advanced speaker identification scenarios. Special attention was given to hybrid ML scheme capable of finding unknown speakers equipped with speaker id-detecting classifier technique, known as "Random Forest-Support Vector Machine" (RF-SVM). The extracted speaker precise spectral attributes are applied to the hybrid RF-SVM classifier to identify/verify the particular speaker. This work aims to construct an ensemble decision tree on a bounded area with minimal misclassification error using a hybrid ensemble RF-SVM strategy. A series of standard, real-time speaker databases, and noise conditions are functionally tested to validate its performance with other state-of-the-art mechanisms. The proposed fusion method succeeds in the speaker identification task with a high identification rate (97% avg) and lower equal error rate (EER) (<2%), compared with the individual schemes for the recorded experimental dataset. The robustness of the classifier is validated using the standard ELSDSR, TIMIT, and NIST audio datasets. Experiments on ELSDSR, TIMIT, and NIST datasets show that the hybrid classifier produces 98%, 99%, and 94% accuracy, and EERs were 2%, 1%, and 2% respectively. The findings are then compared with well-known other speaker recognition schemes and found to be superior.}, }
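A minimal sketch of the feature-plus-hybrid-classifier idea above, with librosa for MFCCs and scikit-learn for the classifiers. Treating "RF-SVM" as a stacking arrangement (forest class probabilities fed to an SVM) is our reading, not necessarily the paper's exact coupling, and the training arrays below are random placeholders:

```python
import numpy as np
import librosa
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def spectral_features(y, sr):
    """Per-utterance feature vector: mean MFCCs plus spectral centroid."""
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    return np.concatenate([mfcc.mean(axis=1), centroid.mean(axis=1)])

rng = np.random.default_rng(0)
demo = spectral_features(rng.normal(size=22050).astype(np.float32), 22050)
print(demo.shape)  # (14,) -> one feature vector per utterance

# Placeholder training set: 60 utterances, 3 speakers, 14 features each
X = rng.normal(size=(60, 14))
y = np.repeat(np.arange(3), 20)

rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)
svm = SVC(kernel="rbf").fit(rf.predict_proba(X), y)  # SVM on RF outputs
print((svm.predict(rf.predict_proba(X)) == y).mean())
```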
@article {pmid35135714, year = {2022}, author = {Menezes, DP and de Lira, ZS and Araújo, ANB and de Almeida, AAF and Gomes, AOC and Moraes, BT and Lucena, JA}, title = {Prosodic Differences in the Voices of Transgender and Cisgender Women: Self-Perception of Voice - An Auditory and Acoustic Analysis.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2021.12.020}, pmid = {35135714}, issn = {1873-4588}, abstract = {INTRODUCTION: The voice is an important parameter for identifying the speaker's gender. Transgender people seek to adapt their bodies to gender identity, and transgender women have greater difficulties in achieving vocal acceptance. In this context, the evaluation of the various parameters of the voice of transgender and cisgender women is essential to make it possible to propose appropriate intervention measures.
OBJECTIVES: To identify the differences in vocal characteristics of transgender and cisgender women.
METHODS: A cross-sectional study was conducted. The sample comprised 20 transgender women and 20 cisgender women who underwent evaluation of acoustic parameters, emotional prosody, self-perception, and perception of gender by lay listeners.
RESULTS: The vocal characteristics of transgender and cisgender women differ in terms of the following parameters: f0, glottal noise excitation (GNE), vocal intensity, speech range profile (SRP), the first three formants of the vowel /a/, and emotional prosody, including duration and melodic contour. Higher values were mostly found in the cisgender population, except for noise level and vocal intensity. In addition, in most cases lay listeners identified the voices of transgender women as belonging to the male gender. There was a negative correlation between vocal dissatisfaction and f0 among transgender women.
CONCLUSIONS: Even though they perform vocal adjustments, the voices of transgender women are different from those of cisgender women in terms of acoustic parameters, vocal extension, and emotional prosody including duration and melodic contour. These differences have repercussions on the perception of gender by listeners.}, }
@article {pmid35130577, year = {2022}, author = {Rishiq, D and Harkrider, A and Springer, C and Hedrick, M}, title = {Effects of Spectral Shaping on Speech Auditory Brainstem Responses to Stop Consonant-Vowel Syllables.}, journal = {Journal of the American Academy of Audiology}, volume = {33}, number = {4}, pages = {232-243}, doi = {10.1055/a-1764-9805}, pmid = {35130577}, issn = {2157-3107}, mesh = {Humans ; Male ; Young Adult ; Aged ; Evoked Potentials, Auditory, Brain Stem/physiology ; Speech ; Acoustic Stimulation/methods ; *Speech Perception/physiology ; *Hearing Aids ; }, abstract = {BACKGROUND: Spectral shaping is employed by hearing aids to make consonantal information, such as formant transitions, audible for listeners with hearing loss. How manipulations of the stimuli, such as spectral shaping, may alter encoding in the auditory brainstem has not been thoroughly studied.
PURPOSE: The aim of this study was to determine how spectral shaping of synthetic consonant-vowel (CV) syllables, varying in their second formant (F2) onset frequency, may affect encoding of the syllables in the auditory brainstem.
RESEARCH DESIGN: We employed a repeated measure design.
STUDY SAMPLE: Sixteen young adults (mean = 20.94 years, 6 males) and 11 older adults (mean = 58.60 years, 4 males) participated in this study.
DATA COLLECTION AND ANALYSIS: Speech-evoked auditory brainstem responses (speech-ABRs) were obtained from each participant using three CV exemplars selected from synthetic stimuli generated for a /ba-da-ga/ continuum. Brainstem responses were also recorded to the corresponding three CV exemplars that were spectrally shaped to decrease low-frequency information and provide gain for middle and high frequencies according to a Desired Sensation Level function. In total, six grand average waveforms (3 phonemes [/ba/, /da/, /ga/] × 2 shaping conditions [unshaped, shaped]) were produced for each participant. Peak latencies and amplitudes, referenced to prestimulus baseline, were identified for 15 speech-ABR peaks. Peaks were marked manually using the program cursor on each individual waveform. Repeated-measures analyses of variance were used to determine the effects of shaping on the latencies and amplitudes of the speech-ABR peaks.
RESULTS: Shaping effects produced changes within participants in ABR latencies and amplitudes involving onset and major peaks of the speech-ABR waveform for certain phonemes. Specifically, data from onset peaks showed that shaping decreased latency for /ga/ in older listeners, and decreased onset amplitude for /ba/ in younger listeners. Shaping also increased the amplitudes of major peaks for /ga/ stimuli in both groups.
CONCLUSIONS: Encoding of speech in the ABR waveform may be more complex and multidimensional than a simple demarcation of source and filter information. These results suggest a more complex subcortical encoding of vocal tract filter information in the ABR waveform, which may also be influenced by cue intensity and age.}, }
@article {pmid35120354, year = {2022}, author = {Easwar, V and Boothalingam, S and Wilson, E}, title = {Sensitivity of Vowel-Evoked Envelope Following Responses to Spectra and Level of Preceding Phoneme Context.}, journal = {Ear and hearing}, volume = {43}, number = {4}, pages = {1327-1335}, doi = {10.1097/AUD.0000000000001190}, pmid = {35120354}, issn = {1538-4667}, mesh = {Electroencephalography ; Humans ; Male ; *Speech Perception/physiology ; Young Adult ; }, abstract = {OBJECTIVE: Vowel-evoked envelope following responses (EFRs) could be a useful noninvasive tool for evaluating neural activity phase-locked to the fundamental frequency of voice (f0). Vowel-evoked EFRs are often elicited by vowels in consonant-vowel syllables or words. Considering neural activity is susceptible to temporal masking, EFR characteristics elicited by the same vowel may vary with the features of the preceding phoneme. To this end, the objective of the present study was to evaluate the influence of the spectral and level characteristics of the preceding phoneme context on vowel-evoked EFRs.
DESIGN: EFRs were elicited by a male-spoken /i/ (stimulus; duration = 350 msec), modified to elicit two EFRs, one from the region of the first formant (F1) and one from the second and higher formants (F2+). The stimulus, presented at 65 dB SPL, was preceded by one of the four contexts: /∫/, /m/, /i/ or a silent gap of duration equal to that of the stimulus. The level of the context phonemes was either 50 or 80 dB SPL, 15 dB lower and higher than the level of the stimulus /i/. In a control condition, EFRs to the stimulus /i/ were elicited in isolation without any preceding phoneme contexts. The stimulus and the contexts were presented monaurally to a randomly chosen test ear in 21 young adults with normal hearing. EFRs were recorded using single-channel electroencephalogram between the vertex and the nape.
RESULTS: A repeated measures analysis of variance indicated a significant three-way interaction between context type (/∫/, /i/, /m/, silent gap), level (50, 80 dB SPL), and EFR-eliciting formant (F1, F2+). Post hoc analyses indicated no influence of the preceding phoneme context on F1-elicited EFRs. Relative to a silent gap as the preceding context, F2+-elicited EFRs were attenuated by /∫/ and /m/ presented at 50 and 80 dB SPL, as well as by /i/ presented at 80 dB SPL. The average attenuation ranged from 14.9 to 27.9 nV. When the context phonemes were presented at matched levels of 50 or 80 dB SPL, F2+-elicited EFRs were most often attenuated when preceded by /∫/. At 80 dB SPL, relative to the silent preceding gap, the average attenuation was 15.7 nV, and at 50 dB SPL, relative to the preceding context phoneme /i/, the average attenuation was 17.2 nV.
CONCLUSION: EFRs elicited by the second and higher formants of /i/ are sensitive to the spectral and level characteristics of the preceding phoneme context. Such sensitivity, measured as an attenuation in the present study, may influence the comparison of EFRs elicited by the same vowel in different consonant-vowel syllables or words. However, the degree of attenuation with realistic context levels exceeded the minimum measurable change only 12% of the time. Although the impact of the preceding context is statistically significant, it is likely to be clinically insignificant a majority of the time.}, }
@article {pmid35111103, year = {2021}, author = {Chiu, C and Weng, Y and Chen, BW}, title = {Tongue Postures and Tongue Centers: A Study of Acoustic-Articulatory Correspondences Across Different Head Angles.}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {768754}, pmid = {35111103}, issn = {1664-1078}, abstract = {Recent research on body and head positions has shown that postural changes may induce varying degrees of changes in acoustic speech signals and articulatory gestures. While the preservation of formant profiles across different postures is suitably accounted for by the two-tube model and perturbation theory, it remains unclear whether it results from the accommodation of tongue postures. Specifically, whether the tongue accommodates the changes in head angle to maintain the target acoustics is yet to be determined. The present study examines vowel acoustics and their correspondence with the articulatory maneuvers of the tongue, including both tongue postures and movements of the tongue center, across different head angles. The results show that vowel acoustics, including pitch and formants, are largely unaffected by upward or downward tilting of the head. These preserved acoustics may be attributed to the lingual gestures that compensate for the effects of gravity. Our results also reveal that the tongue postures in response to head movements appear to be vowel-dependent, and the tongue center may serve as an underlying drive that covaries with the head angle changes. These results imply a close relationship between vowel acoustics and tongue postures as well as a target-oriented strategy for different head angles.}, }
@article {pmid35105035, year = {2022}, author = {Merritt, B and Bent, T}, title = {Revisiting the acoustics of speaker gender perception: A gender expansive perspective.}, journal = {The Journal of the Acoustical Society of America}, volume = {151}, number = {1}, pages = {484}, doi = {10.1121/10.0009282}, pmid = {35105035}, issn = {1520-8524}, mesh = {Acoustics ; Female ; Femininity ; Humans ; Male ; Masculinity ; *Speech Acoustics ; *Speech Perception ; }, abstract = {Examinations of speaker gender perception have primarily focused on the roles of fundamental frequency (fo) and formant frequencies from structured speech tasks using cisgender speakers. Yet, there is evidence to suggest that fo and formants do not fully account for listeners' perceptual judgements of gender, particularly from connected speech. This study investigated the perceptual importance of fo, formant frequencies, articulation, and intonation in listeners' judgements of gender identity and masculinity/femininity from spontaneous speech from cisgender male and female speakers as well as transfeminine and transmasculine speakers. Stimuli were spontaneous speech samples from 12 speakers who are cisgender (6 female and 6 male) and 12 speakers who are transgender (6 transfeminine and 6 transmasculine). Listeners performed a two-alternative forced choice (2AFC) gender identification task and masculinity/femininity rating task in two experiments that manipulated which acoustic cues were available. Experiment 1 confirmed that fo and formant frequency manipulations were insufficient to alter listener judgements across all speakers. Experiment 2 demonstrated that articulatory cues had greater weighting than intonation cues on the listeners' judgements when the fo and formant frequencies were in a gender ambiguous range. These findings counter the assumptions that fo and formant manipulations are sufficient to effectively alter perceived speaker gender.}, }
@article {pmid35104414, year = {2022}, author = {Kim, Y and Chung, H and Thompson, A}, title = {Acoustic and Articulatory Characteristics of English Semivowels /ɹ, l, w/ Produced by Adult Second-Language Speakers.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {3}, pages = {890-905}, doi = {10.1044/2021_JSLHR-21-00152}, pmid = {35104414}, issn = {1558-9102}, mesh = {Acoustics ; Adult ; Humans ; *Language ; *Multilingualism ; Phonetics ; Speech Acoustics ; }, abstract = {PURPOSE: This study presents the results of acoustic and kinematic analyses of word-initial semivowels (/ɹ, l, w/) produced by second-language (L2) speakers of English whose native language is Korean. In addition, the relationship of acoustic and kinematic measures to the ratings of foreign accent was examined by correlation analyses.
METHOD: Eleven L2 speakers and 10 native speakers (first language [L1]) of English read The Caterpillar passage. Acoustic and kinematic data were simultaneously recorded using an electromagnetic articulography system. In addition to speaking rate, two acoustic measures (ratio of third-formant [F3] frequency to second-formant [F2] frequency and duration of steady states of F2) and two kinematic measures (lip aperture and duration of lingual maximum hold) were obtained from individual target sounds. To examine the degree of contrast among the three sounds, acoustic and kinematic Euclidean distances were computed on the F2-F3 and x-y planes, respectively.
RESULTS: Compared with L1 speakers, L2 speakers exhibited a significantly slower speaking rate. For the three semivowels, L2 speakers showed a reduced F3/F2 ratio during constriction, increased lip aperture, and reduced acoustic Euclidean distances among semivowels. Additionally, perceptual ratings of foreign accent were significantly correlated with three measures: duration of steady F2, acoustic Euclidean distance, and kinematic Euclidean distance.
CONCLUSIONS: The findings provide acoustic and kinematic evidence for challenges that L2 speakers experience in the production of English semivowels, especially /ɹ/ and /w/. The robust and consistent finding of reduced contrasts among semivowels and their correlations with perceptual accent ratings suggests using sound contrasts as a potentially effective approach to accent modification paradigms.}, }
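The contrast measure above, Euclidean distance among semivowels on the F2-F3 plane, is a one-liner per pair. The (F2, F3) values below are invented placeholders chosen so that the L2 space is more compressed, as the RESULTS describe:

```python
import numpy as np

# Hypothetical mean (F2, F3) values in Hz for word-initial /r, l, w/
l1_space = {"r": (1300, 1700), "l": (1100, 2600), "w": (800, 2300)}
l2_space = {"r": (1350, 2000), "l": (1150, 2500), "w": (900, 2300)}

def mean_contrast(space):
    """Mean pairwise Euclidean distance on the F2-F3 plane."""
    pts = {k: np.asarray(v, dtype=float) for k, v in space.items()}
    pairs = [("r", "l"), ("r", "w"), ("l", "w")]
    return float(np.mean([np.linalg.norm(pts[a] - pts[b]) for a, b in pairs]))

print(f"L1 contrast: {mean_contrast(l1_space):.0f} Hz")
print(f"L2 contrast: {mean_contrast(l2_space):.0f} Hz")  # smaller contrast
```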
@article {pmid35093243, year = {2022}, author = {Takemoto, N and Sanuki, T and Esaki, S and Iwasaki, S}, title = {Rabbit model with vocal fold hyperadduction.}, journal = {Auris, nasus, larynx}, volume = {49}, number = {5}, pages = {810-815}, doi = {10.1016/j.anl.2022.01.008}, pmid = {35093243}, issn = {1879-1476}, mesh = {Animals ; *Dysphonia ; Glottis ; Humans ; Laryngeal Muscles ; Phonation/physiology ; Rabbits ; *Vocal Cords ; }, abstract = {OBJECTIVE: Adductor spasmodic dysphonia (AdSD) is caused by hyperadduction of the vocal folds during phonation, resulting in a strained voice. Animal models have not yet been used to elucidate this intractable disease because AdSD has a complex pathology without a definitive origin. As a first step, we established an animal model with vocal fold hyperadduction and evaluated its validity by assessing laryngeal function.
METHODS: In this experimental animal study, three adult Japanese 20-week-old rabbits were used. The models were created using a combination of cricothyroid approximation, forced airflow, and electrical stimulation of the recurrent laryngeal nerves (RLNs). Cricothyroid approximation was added to produce a glottal slit. Thereafter, both RLNs were electrically stimulated to induce vocal fold hyperadduction. Finally, the left RLN was transected to relieve hyperadduction. The sound, endoscopic images, and subglottal pressure were recorded, and acoustic analysis was performed.
RESULTS: Subglottal pressure increased significantly, and the strained sound was produced after the electrical stimulation of the RLNs. After transecting the left RLN, the subglottal pressure decreased significantly, and the strained sound decreased. Acoustic analysis revealed an elevation of the standard deviation of F0 (SDF0) and degree of voice breaks (DVB) through stimulation of the RLNs, and degradation of SDF0 and DVB through RLN transection. Formant bands in the sound spectrogram were interrupted by the stimulation and appeared again after the RLN section.
CONCLUSION: This study developed a rabbit model with vocal fold hyperadduction. The subglottal pressure and acoustic analysis of this model resembled the characteristics of patients with AdSD. This model could be helpful to elucidate the pathology of the larynx caused by hyperadduction, and evaluate and compare the treatments for strained phonation.}, }
@article {pmid35086866, year = {2022}, author = {Heeringa, AN and Köppl, C}, title = {Auditory Nerve Fiber Discrimination and Representation of Naturally-Spoken Vowels in Noise.}, journal = {eNeuro}, volume = {9}, number = {1}, pages = {}, pmid = {35086866}, issn = {2373-2822}, mesh = {Auditory Perception/physiology ; Cochlear Nerve/physiology ; Nerve Fibers/physiology ; *Noise ; Phonetics ; Speech ; *Speech Perception/physiology ; }, abstract = {To understand how vowels are encoded by auditory nerve (AN) fibers, a number of representation schemes have been suggested that extract the vowel's formant frequencies from AN-fiber spiking patterns. The current study aims to apply and compare these schemes for AN-fiber responses to naturally-spoken vowels in a speech-shaped background noise. Responses to three vowels were evaluated; based on behavioral experiments in the same species, two of these were perceptually difficult to discriminate from each other (/e/ vs /i/), and one was perceptually easy to discriminate from the other two (/a:/). Single-unit AN fibers were recorded from ketamine/xylazine-anesthetized Mongolian gerbils of either sex (n = 8). First, single-unit discrimination between the three vowels was studied. Compared with the perceptually easy discriminations, the average spike timing-based discrimination values were significantly lower for the perceptually difficult vowel discrimination. This was not true for an average rate-based discrimination metric, the rate d-prime (d'). Consistently, spike timing-based representation schemes, plotting the temporal responses of all recorded units as a function of their best frequency (BF), i.e., dominant component schemes, average localized interval rate, and fluctuation profiles, revealed representation of the vowel's formant frequencies, whereas no such representation was apparent in the rate-based excitation pattern. Making use of perceptual discrimination data, this study reveals that discrimination difficulties of naturally-spoken vowels in speech-shaped noise originate peripherally and can be studied in the spike timing patterns of single AN fibers.}, }
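The rate d' used above has a standard form: the difference between the mean spike counts for the two vowels divided by the root-mean-square of their standard deviations. A minimal sketch with simulated Poisson counts (the rates and repetition count are invented):

```python
import numpy as np

def rate_d_prime(counts_a, counts_b):
    """Rate-based d' between two spike-count distributions for one fiber."""
    a = np.asarray(counts_a, dtype=float)
    b = np.asarray(counts_b, dtype=float)
    pooled_sd = np.sqrt((a.var(ddof=1) + b.var(ddof=1)) / 2.0)
    return (a.mean() - b.mean()) / pooled_sd

# Hypothetical counts for 20 repetitions each of two vowels
rng = np.random.default_rng(0)
print(rate_d_prime(rng.poisson(40, 20), rng.poisson(30, 20)))
```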
@article {pmid35077652, year = {2022}, author = {Yüksel, M}, title = {Reliability and Efficiency of Pitch-Shifting Plug-Ins in Voice and Hearing Research.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {3}, pages = {878-889}, doi = {10.1044/2021_JSLHR-21-00440}, pmid = {35077652}, issn = {1558-9102}, mesh = {Feedback, Sensory ; Female ; Hearing ; Humans ; Male ; *Music ; Pitch Perception ; Reproducibility of Results ; *Voice ; }, abstract = {PURPOSE: Auditory feedback perturbation with voice pitch manipulation has been widely used in previous studies. There are several hardware and software tools for such manipulations, but audio plug-ins developed for music, movies, and radio applications that operate in digital audio workstations may be extremely beneficial and are easy to use, accessible, and cost effective. However, it is unknown whether these plug-ins can perform similarly to tools that have been described in previous literature. Hence, this study aimed to evaluate the reliability and efficiency of these plug-ins.
METHOD: Six different plug-ins were used at +1 and -1 semitone (st) pitch shifts, with formant correction on and off, to pitch shift the sustained /ɑ/ voice recording samples of 12 healthy participants (six cisgender males and six cisgender females). Pitch-shifting accuracy, formant shifting amount, intensity changes, and total latency values were reported.
RESULTS: Some variability was observed between different plug-ins and pitch shift settings. One plug-in managed to perform similarly in all four measured aspects with well-known hardware and software units with 1-cent pitch-shifting accuracy, low latency values, negligible intensity difference, and preserved formants. Other plug-ins performed similarly in some respects.
CONCLUSIONS: Audio plug-ins may be used effectively in pitch-shifting applications. Researchers and clinicians can access these plug-ins easily and test whether the features also fit their aims.}, }
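Pitch-shifting accuracy of the kind reported above (1-cent accuracy) is conventionally expressed in cents, 1200 times the base-2 logarithm of the frequency ratio. A minimal sketch; the measured value is made up:

```python
import math

def cents_error(measured_hz, target_hz):
    """Deviation of a shifted f0 from its target, in cents (100 = 1 st)."""
    return 1200.0 * math.log2(measured_hz / target_hz)

# A +1 st shift of a 220 Hz voice should land at ~233.08 Hz
target = 220.0 * 2 ** (1 / 12)
print(f"{cents_error(233.3, target):+.1f} cents")  # small positive error
```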
@article {pmid35071434, year = {2021}, author = {Cao, S and Xia, M and Zhou, R and Wang, J and Jin, CY and Pei, B and Zhou, ZK and Qian, YM and Jiang, H}, title = {Voice parameters for difficult mask ventilation evaluation: an observational study.}, journal = {Annals of translational medicine}, volume = {9}, number = {23}, pages = {1740}, pmid = {35071434}, issn = {2305-5839}, abstract = {BACKGROUND: Mask ventilation (MV) is an essential component of airway management. Difficult mask ventilation (DMV) is a major cause for perioperative hypoxic brain injury; however, predicting DMV remains a challenge. This study aimed to determine the potential value of voice parameters as novel predictors of DMV in patients scheduled for general anesthesia.
METHODS: We included 1,160 adult patients scheduled for elective surgery under general anesthesia. The clinical variables usually reported as predictors of DMV were collected before surgery. Voice samples of the phonemes ([a], [o], [e], [i], [u], [ü], [ci], [qi], [chi], [le], [ke], and [en]) were recorded, and their formants (f1-f4) and bandwidths (bw1-bw4) were extracted. The definition of DMV was the inability of an unassisted anesthesiologist to ensure adequate ventilation during MV under general anesthesia. Univariate and multivariate logistic regression analyses were used to explore the association between voice parameters and DMV. The predictive value of the voice parameters was evaluated by assessment of area under the curve (AUC) of receiver operating characteristic (ROC) curves of a stepwise forward model.
RESULTS: The prevalence of DMV was 218/1,160 (18.8%). The AUC of the stepwise forward model (including o_f4, e_bw2, i_f3, u_pitch, u_f1, u_f4, ü_bw4, ci_f1, qi_f1, qi_f4, qi_bw4, chi_f1, chi_bw2, chi_bw4, le_pitch, le_bw3, ke_bw2, en_pitch, en_f2, and en_bw4) attained a value of 0.779. The sensitivity and specificity of the model were 75.0% and 71.0%, respectively.
CONCLUSIONS: Voice parameters may be considered alternative predictors of DMV, but additional studies are needed to confirm these initial findings.}, }
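The formant (f1-f4) and bandwidth (bw1-bw4) features used above are commonly estimated from the poles of a linear predictive coding (LPC) fit; the entry does not name its extraction tool, so the sketch below is one standard approach (librosa is assumed for loading and the LPC fit, and the plausibility thresholds are illustrative):

    import numpy as np
    import librosa

    def formants_and_bandwidths(path, order=12, n=4):
        # Estimate the first n formants (f1..fn) and bandwidths (bw1..bwn)
        # from the LPC poles of a recorded phoneme.
        y, sr = librosa.load(path, sr=10000)        # ~10 kHz keeps f1-f4 in band
        y = np.append(y[0], y[1:] - 0.97 * y[:-1])  # pre-emphasis
        a = librosa.lpc(y * np.hamming(len(y)), order=order)
        roots = [r for r in np.roots(a) if np.imag(r) > 0]
        cands = sorted(
            (np.angle(r) * sr / (2 * np.pi),        # pole angle -> frequency
             -np.log(np.abs(r)) * sr / np.pi)       # pole radius -> bandwidth
            for r in roots
        )
        # Keep plausible formants: above 90 Hz and reasonably narrow-band.
        return [(f, bw) for f, bw in cands if f > 90 and bw < 400][:n]

    # e.g. features = formants_and_bandwidths("a.wav")  # hypothetical file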
@article {pmid35069371, year = {2021}, author = {Lee, A and Ng, E}, title = {Hong Kong Women Project a Larger Body When Speaking to Attractive Men.}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {786507}, pmid = {35069371}, issn = {1664-1078}, abstract = {In this pilot study we investigated the vocal strategies of Cantonese women when addressing an attractive vs. unattractive male. We recruited 19 young female native speakers of Hong Kong Cantonese who completed an attractiveness rating task, followed by a speech production task in which they were presented with a subset of the same faces. By comparing the rating results and the corresponding acoustic data for the facial stimuli, we found that when young Cantonese women spoke to an attractive male, they were less breathy, lower in fundamental frequency, and had denser formants, all of which are considered to project a larger body. Participants who were more satisfied with their own height used these vocal strategies more actively. These results are discussed in terms of the body size projection principle.}, }
@article {pmid35062025, year = {2022}, author = {Suess, N and Hauswald, A and Reisinger, P and Rösch, S and Keitel, A and Weisz, N}, title = {Cortical tracking of formant modulations derived from silently presented lip movements and its decline with age.}, journal = {Cerebral cortex (New York, N.Y. : 1991)}, volume = {32}, number = {21}, pages = {4818-4833}, pmid = {35062025}, issn = {1460-2199}, support = {P 31230/FWF_/Austrian Science Fund FWF/Austria ; P 34237/FWF_/Austrian Science Fund FWF/Austria ; }, mesh = {Humans ; *Speech Perception ; Acoustic Stimulation ; Lip ; Speech ; Movement ; }, abstract = {The integration of visual and auditory cues is crucial for successful processing of speech, especially under adverse conditions. Recent reports have shown that when participants watch muted videos of speakers, the phonological information about the acoustic speech envelope, which is associated with but independent from the speakers' lip movements, is tracked by the visual cortex. However, the speech signal also carries richer acoustic details, for example, about the fundamental frequency and the resonant frequencies, whose visuophonological transformation could aid speech processing. Here, we investigated the neural basis of the visuo-phonological transformation processes of these more fine-grained acoustic details and assessed how they change as a function of age. We recorded whole-head magnetoencephalographic (MEG) data while the participants watched silent normal (i.e., natural) and reversed videos of a speaker and paid attention to their lip movements. We found that the visual cortex is able to track the unheard natural modulations of resonant frequencies (or formants) and the pitch (or fundamental frequency) linked to lip movements. Importantly, only the processing of natural unheard formants decreases significantly with age in the visual and also in the cingulate cortex. This is not the case for the processing of the unheard speech envelope, the fundamental frequency, or the purely visual information carried by lip movements. These results show that unheard spectral fine details (along with the unheard acoustic envelope) are transformed from a mere visual to a phonological representation. Aging affects especially the ability to derive spectral dynamics at formant frequencies. As listening in noisy environments should capitalize on the ability to track spectral fine details, our results provide a novel focus on compensatory processes in such challenging situations.}, }
@article {pmid35038295, year = {2022}, author = {Almaghrabi, SA and Thewlis, D and Thwaites, S and Rogasch, NC and Lau, S and Clark, SR and Baumert, M}, title = {The Reproducibility of Bio-Acoustic Features is Associated With Sample Duration, Speech Task, and Gender.}, journal = {IEEE transactions on neural systems and rehabilitation engineering : a publication of the IEEE Engineering in Medicine and Biology Society}, volume = {30}, number = {}, pages = {167-175}, doi = {10.1109/TNSRE.2022.3143117}, pmid = {35038295}, issn = {1558-0210}, mesh = {Acoustics ; Adult ; Female ; Humans ; Male ; Reproducibility of Results ; *Speech ; Speech Acoustics ; *Voice ; }, abstract = {Bio-acoustic properties of speech show evolving value in analyzing psychiatric illnesses. Obtaining a sufficient speech sample length to quantify these properties is essential, but the impact of sample duration on the stability of bio-acoustic features has not been systematically explored. We aimed to evaluate bio-acoustic features' reproducibility against changes in speech durations and tasks. We extracted source, spectral, formant, and prosodic features in 185 English-speaking adults (98 w, 87 m) for reading-a-story and counting tasks. We compared features at 25% of the total sample duration of the reading task to those obtained from non-overlapping randomly selected sub-samples shortened to 75%, 50%, and 25% of total duration using intraclass correlation coefficients. We also compared the features extracted from entire recordings to those measured at 25% of the duration and features obtained from 50% of the duration. Further, we compared features extracted from reading-a-story to counting tasks. Our results show that the number of reproducible features (out of 125) decreased stepwise with duration reduction. Spectral shape, pitch, and formants reached excellent reproducibility. Mel-frequency cepstral coefficients (MFCCs), loudness, and zero-crossing rate achieved excellent reproducibility only at a longer duration. Reproducibility of source, MFCC derivatives, and voicing probability (VP) was poor. Significant gender differences existed in jitter, MFCC first-derivative, spectral skewness, pitch, VP, and formants. Around 97% of features in both genders were not reproducible across speech tasks, in part due to the short counting task duration. In conclusion, bio-acoustic features are less reproducible in shorter samples and are affected by gender.}, }
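The reproducibility judgments in the preceding entry rest on intraclass correlation coefficients, which can be computed directly from a two-way ANOVA decomposition. A sketch of the Shrout-Fleiss ICC(2,1) (two-way random effects, absolute agreement, single measure; whether the authors used this exact ICC form is an assumption):

    import numpy as np

    def icc2_1(x):
        # x: (n_subjects, k_conditions), e.g. a feature measured on the full
        # recording and on a 25%-duration sub-sample. Two-way random-effects,
        # absolute-agreement, single-measure ICC.
        n, k = x.shape
        grand = x.mean()
        ms_r = k * ((x.mean(axis=1) - grand) ** 2).sum() / (n - 1)  # subjects
        ms_c = n * ((x.mean(axis=0) - grand) ** 2).sum() / (k - 1)  # conditions
        resid = x - x.mean(axis=1, keepdims=True) - x.mean(axis=0) + grand
        ms_e = (resid ** 2).sum() / ((n - 1) * (k - 1))             # error
        return (ms_r - ms_e) / (ms_r + (k - 1) * ms_e + k * (ms_c - ms_e) / n)

    # e.g. icc2_1(np.column_stack([f1_full, f1_quarter]))  # hypothetical data

By a common convention (e.g., Koo and Li), values above roughly 0.9 are read as "excellent" reproducibility, which is how thresholds of this kind are usually applied.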
@article {pmid35005711, year = {2021}, author = {Gaines, JL and Kim, KS and Parrell, B and Ramanarayanan, V and Nagarajan, SS and Houde, JF}, title = {Discrete constriction locations describe a comprehensive range of vocal tract shapes in the Maeda model.}, journal = {JASA express letters}, volume = {1}, number = {12}, pages = {124402}, pmid = {35005711}, issn = {2691-1191}, support = {F32 DC019538/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; }, abstract = {The Maeda model was used to generate a large set of vocoid-producing vocal tract configurations. The resulting dataset (a) produced a comprehensive range of formant frequencies and (b) displayed discrete tongue body constriction locations (palatal, velar/uvular, and lower pharyngeal). The discrete parameterization of constriction location across the vowel space suggests this is likely a fundamental characteristic of the human vocal tract, and not limited to any specific set of vowel contrasts. These findings suggest that in addition to established articulatory-acoustic constraints, fundamental biomechanical constraints of the vocal tract may also explain such discreteness.}, }
@article {pmid34987356, year = {2021}, author = {Cheng, FY and Xu, C and Gold, L and Smith, S}, title = {Rapid Enhancement of Subcortical Neural Responses to Sine-Wave Speech.}, journal = {Frontiers in neuroscience}, volume = {15}, number = {}, pages = {747303}, pmid = {34987356}, issn = {1662-4548}, support = {K01 DC017192/DC/NIDCD NIH HHS/United States ; }, abstract = {The efferent auditory nervous system may be a potent force in shaping how the brain responds to behaviorally significant sounds. Previous human experiments using the frequency following response (FFR) have shown efferent-induced modulation of subcortical auditory function online and over short- and long-term time scales; however, a contemporary understanding of FFR generation presents new questions about whether previous effects were constrained solely to the auditory subcortex. The present experiment used sine-wave speech (SWS), an acoustically-sparse stimulus in which dynamic pure tones represent speech formant contours, to evoke FFR_SWS. Due to the higher stimulus frequencies used in SWS, this approach biased neural responses toward brainstem generators and allowed for three stimuli (/bɔ/, /bu/, and /bo/) to be used to evoke FFR_SWS before and after listeners in a training group were made aware that they were hearing a degraded speech stimulus. All SWS stimuli were rapidly perceived as speech when presented with a SWS carrier phrase, and average token identification reached ceiling performance during a perceptual training phase. Compared to a control group which remained naïve throughout the experiment, training group FFR_SWS amplitudes were enhanced post-training for each stimulus. Further, linear support vector machine classification of training group FFR_SWS significantly improved post-training compared to the control group, indicating that training-induced neural enhancements were sufficient to bolster machine learning classification accuracy. These results suggest that the efferent auditory system may rapidly modulate auditory brainstem representation of sounds depending on their context and perception as non-speech or speech.}, }
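Sine-wave speech of the kind used as stimuli above replaces each formant track with a single frequency-modulated sinusoid (the Remez-style reduction). A minimal synthesis sketch; the two-formant trajectory is a toy illustration, not the study's /bɔ/, /bu/, /bo/ tokens:

    import numpy as np

    def sine_wave_speech(tracks, amps, sr=16000):
        # Sum one sinusoid per formant track; the instantaneous phase is the
        # integral (cumulative sum) of the frequency contour.
        out = np.zeros(len(tracks[0]))
        for f, a in zip(tracks, amps):
            out += a * np.sin(2 * np.pi * np.cumsum(f) / sr)
        return out / np.max(np.abs(out))

    sr = 16000
    n = int(0.4 * sr)                      # 400 ms token
    f1 = np.linspace(300, 500, n)          # rising F1 analogue
    f2 = np.linspace(1200, 900, n)         # falling F2 analogue
    y = sine_wave_speech([f1, f2], [1.0, 0.5], sr)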
@article {pmid34975607, year = {2021}, author = {Meykadeh, A and Golfam, A and Nasrabadi, AM and Ameri, H and Sommer, W}, title = {First Event-Related Potentials Evidence of Auditory Morphosyntactic Processing in a Subject-Object-Verb Nominative-Accusative Language (Farsi).}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {698165}, pmid = {34975607}, issn = {1664-1078}, abstract = {While most studies on neural signals of online language processing have focused on a few, usually Western, subject-verb-object (SVO) languages, corresponding knowledge on subject-object-verb (SOV) languages is scarce. Here we studied Farsi, a language with canonical SOV word order. Because we were interested in the consequences of second-language acquisition, we compared monolingual native Farsi speakers and equally proficient bilinguals who had learned Farsi only after entering primary school. We analyzed event-related potentials (ERPs) to correct and morphosyntactically incorrect sentence-final syllables in a sentence correctness judgment task. Incorrect syllables elicited a late posterior positivity at 500-700 ms after the final syllable, resembling the P600 component, as previously observed for syntactic violations at sentence-middle positions in SVO languages. There was no sign of a left anterior negativity (LAN) preceding the P600. Additionally, we provide evidence for a real-time discrimination of phonological categories associated with morphosyntactic manipulations (between 35 and 135 ms), manifesting the instantaneous neural response to unexpected perturbations. The L2 Farsi speakers were indistinguishable from L1 speakers in terms of performance and neural signals of syntactic violations, indicating that exposure to a second language at school entry may result in native-like performance and neural correlates. In non-native (but not native) speakers, verbal working memory capacity correlated with the late posterior positivity and performance accuracy. Hence, this first ERP study of morphosyntactic violations in a spoken SOV nominative-accusative language demonstrates ERP effects in response to morphosyntactic violations and the involvement of executive functions in non-native speakers in computations of subject-verb agreement.}, }
@article {pmid34966297, year = {2021}, author = {Yamada, Y and Shinkawa, K and Nemoto, M and Arai, T}, title = {Automatic Assessment of Loneliness in Older Adults Using Speech Analysis on Responses to Daily Life Questions.}, journal = {Frontiers in psychiatry}, volume = {12}, number = {}, pages = {712251}, pmid = {34966297}, issn = {1664-0640}, abstract = {Loneliness is a perceived state of social and emotional isolation that has been associated with a wide range of adverse health effects in older adults. Automatically assessing loneliness by passively monitoring daily behaviors could potentially contribute to early detection and intervention for mitigating loneliness. Speech data has been successfully used for inferring changes in emotional states and mental health conditions, but its association with loneliness in older adults remains unexplored. In this study, we developed a tablet-based application and collected speech responses of 57 older adults to daily life questions regarding, for example, one's feelings and future travel plans. From the audio data of these speech responses, we automatically extracted speech features characterizing acoustic, prosodic, and linguistic aspects, and investigated their associations with self-rated scores of the UCLA Loneliness Scale. Consequently, we found that with increasing loneliness scores, speech responses tended to have fewer inflections, longer pauses, reduced second formant frequencies, reduced variances of the speech spectrum, more filler words, and fewer positive words. The cross-validation results showed that regression and binary-classification models using speech features could estimate loneliness scores with an R^2 of 0.57 and detect individuals with high loneliness scores with 95.6% accuracy, respectively. Our study provides the first empirical results suggesting the possibility of using speech data that can be collected in everyday life for the automatic assessment of loneliness in older adults, which could help develop monitoring technologies for early detection and intervention for mitigating loneliness.}, }
@article {pmid34963204, year = {2021}, author = {Hussain, Q and Kochetov, A}, title = {Acoustic classification of coronal stops of Eastern Punjabi.}, journal = {Phonetica}, volume = {79}, number = {1}, pages = {77-110}, doi = {10.1515/phon-2021-2015}, pmid = {34963204}, issn = {1423-0321}, mesh = {Acoustics ; Humans ; *Language ; Phonetics ; *Speech Acoustics ; Voice Quality ; }, abstract = {Punjabi is an Indo-Aryan language which contrasts a rich set of coronal stops at dental and retroflex places of articulation across three laryngeal configurations. Moreover, all these stops occur contrastively in various positions (word-initially, -medially, and -finally). The goal of this study is to investigate how various coronal place and laryngeal contrasts are distinguished acoustically both within and across word positions. A number of temporal and spectral correlates were examined in data from 13 speakers of Eastern Punjabi: Voice Onset Time, release and closure durations, fundamental frequency, F1-F3 formants, spectral center of gravity and standard deviation, H1*-H2*, and cepstral peak prominence. The findings indicated that higher formants and spectral measures were most important for the classification of place contrasts across word positions, whereas laryngeal contrasts were reliably distinguished by durational and voice quality measures. Word-medially and -finally, F2 and F3 of the preceding vowels played a key role in distinguishing the dental and retroflex stops, while spectral noise measures were more important word-initially. The findings of this study contribute to a better understanding of factors involved in the maintenance of typologically rare and phonetically complex sets of place and laryngeal contrasts in the coronal stops of Indo-Aryan languages.}, }
@article {pmid34924928, year = {2021}, author = {Zheng, Z and Li, K and Feng, G and Guo, Y and Li, Y and Xiao, L and Liu, C and He, S and Zhang, Z and Qian, D and Feng, Y}, title = {Relative Weights of Temporal Envelope Cues in Different Frequency Regions for Mandarin Vowel, Consonant, and Lexical Tone Recognition.}, journal = {Frontiers in neuroscience}, volume = {15}, number = {}, pages = {744959}, pmid = {34924928}, issn = {1662-4548}, abstract = {Objectives: Mandarin-speaking users of cochlear implants (CI) perform more poorly than their English-speaking counterparts. This may be because present CI speech coding schemes are largely based on English. This study aims to evaluate the relative contributions of temporal envelope (E) cues to Mandarin phoneme (vowel and consonant) and lexical tone recognition to provide information for speech coding schemes specific to Mandarin. Design: Eleven normal-hearing subjects were studied using acoustic temporal E cues that were extracted from 30 continuous frequency bands between 80 and 7,562 Hz using the Hilbert transform and divided into five frequency regions. Percent-correct recognition scores were obtained with acoustic E cues presented in three, four, and five frequency regions, and their relative weights were calculated using a least-squares approach. Results: For stimuli with three, four, and five frequency regions, percent-correct scores for vowel recognition using E cues were 50.43-84.82%, 76.27-95.24%, and 96.58%, respectively; for consonant recognition 35.49-63.77%, 67.75-78.87%, and 87.87%; for lexical tone recognition 60.80-97.15%, 73.16-96.87%, and 96.73%. For frequency region 1 to frequency region 5, the mean weights in vowel recognition were 0.17, 0.31, 0.22, 0.18, and 0.12, respectively; in consonant recognition 0.10, 0.16, 0.18, 0.23, and 0.33; in lexical tone recognition 0.38, 0.18, 0.14, 0.16, and 0.14. Conclusion: The region that contributed most to vowel recognition was Region 2 (502-1,022 Hz), which contains first formant (F1) information; Region 5 (3,856-7,562 Hz) contributed most to consonant recognition; and Region 1 (80-502 Hz), which contains fundamental frequency (F0) information, contributed most to lexical tone recognition.}, }
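The least-squares weighting above can be sketched as a linear model in which each condition's percent-correct score is regressed on indicators of which frequency regions were presented. The numbers and the exact design matrix below are illustrative assumptions, not the study's data:

    import numpy as np

    # Rows: one test condition each; columns: regions 1-5 present (1) or not (0).
    X = np.array([[1, 1, 1, 0, 0],
                  [1, 1, 0, 1, 0],
                  [0, 1, 1, 1, 0],
                  [1, 1, 1, 1, 0],
                  [0, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1]], dtype=float)
    y = np.array([62.0, 68.0, 70.0, 81.0, 88.0, 96.0])  # percent correct

    w, *_ = np.linalg.lstsq(X, y, rcond=None)  # per-region contributions
    print((w / w.sum()).round(2))              # normalized relative weights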
@article {pmid34889651, year = {2022}, author = {Polka, L and Masapollo, M and Ménard, L}, title = {Setting the Stage for Speech Production: Infants Prefer Listening to Speech Sounds With Infant Vocal Resonances.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {65}, number = {1}, pages = {109-120}, doi = {10.1044/2021_JSLHR-21-00412}, pmid = {34889651}, issn = {1558-9102}, mesh = {Adult ; Auditory Perception ; Humans ; Infant ; Phonetics ; Speech ; *Speech Perception ; *Voice ; }, abstract = {PURPOSE: Current models of speech development argue for an early link between speech production and perception in infants. Recent data show that young infants (at 4-6 months) preferentially attend to speech sounds (vowels) with infant vocal properties compared to those with adult vocal properties, suggesting the presence of special "memory banks" for one's own nascent speech-like productions. This study investigated whether the vocal resonances (formants) of the infant vocal tract are sufficient to elicit this preference and whether this perceptual bias changes with age and emerging vocal production skills.
METHOD: We selectively manipulated the fundamental frequency (f0) of vowels synthesized with formants specifying either an infant or adult vocal tract, and then tested the effects of those manipulations on the listening preferences of infants who were slightly older than those previously tested (at 6-8 months).
RESULTS: Unlike findings with younger infants (at 4-6 months), slightly older infants in Experiment 1 displayed a robust preference for vowels with infant formants over adult formants when f0 was matched. The strength of this preference was also positively correlated with age among infants between 4 and 8 months. In Experiment 2, this preference favoring infant over adult formants was maintained when f0 values were modulated.
CONCLUSIONS: Infants between 6 and 8 months of age displayed a robust and distinct preference for speech with resonances specifying a vocal tract that is similar in size and length to their own. This finding, together with data indicating that this preference is not present in younger infants and appears to increase with age, suggests that nascent knowledge of the motor schema of the vocal tract may play a role in shaping this perceptual bias, lending support to current models of speech development.
SUPPLEMENTAL MATERIAL: https://doi.org/10.23641/asha.17131805.}, }
@article {pmid34860148, year = {2021}, author = {Sundberg, J and Lindblom, B and Hefele, AM}, title = {Voice source, formant frequencies and vocal tract shape in overtone singing. A case study.}, journal = {Logopedics, phoniatrics, vocology}, volume = {}, number = {}, pages = {1-13}, doi = {10.1080/14015439.2021.1998607}, pmid = {34860148}, issn = {1651-2022}, abstract = {Purpose: In overtone singing a singer produces two pitches simultaneously, a low-pitched, continuous drone plus a melody played on the higher, flutelike and strongly enhanced overtones of the drone. The purpose of this study was to analyse underlying acoustical, phonatory and articulatory phenomena. Methods: The voice source was analyzed by inverse filtering the sound, the articulation from a dynamic MRI video of the vocal tract profile, and the lip opening from a frontal-view video recording. Vocal tract cross-distances were measured in the MR recording and converted to area functions, the formant frequencies of which were computed. Results: Inverse filtering revealed that the overtone enhancement resulted from a close clustering of formants 2 and 3. The MRI material showed that for low enhanced overtone frequencies (FE) the tongue tip was raised and strongly retracted, while for high FE the tongue tip was less retracted but formed a longer constriction. Thus, the tongue configuration changed from an apical/anterior to a dorsal/posterior articulation. The formant frequencies derived from the area functions matched almost perfectly those used for the inverse filtering. Further, analyses of the area functions revealed that the second formant frequency was strongly dependent on the back cavity, and the third on the front cavity, which acted like a Helmholtz resonator, tuned by the tongue tip position and lip opening. Conclusions: This type of overtone singing can be fully explained by the well-established source-filter theory of voice production, as recently found by Bergevin et al. [1] for another type of overtone singing.}, }
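The front-cavity behaviour described in that conclusion follows the textbook Helmholtz relation f = (c / 2*pi) * sqrt(A / (V * L)). A small sketch with invented cavity dimensions shows how narrowing the lip opening lowers the resonance:

    import numpy as np

    def helmholtz_hz(area_cm2, volume_cm3, neck_cm, c=35000.0):
        # f = (c / 2*pi) * sqrt(A / (V * L)), with c in cm/s: the front cavity
        # is the enclosed volume and the lip opening acts as the neck.
        return c / (2 * np.pi) * np.sqrt(area_cm2 / (volume_cm3 * neck_cm))

    # Halving the lip opening area lowers the resonance by a factor sqrt(2):
    print(helmholtz_hz(2.0, 30.0, 1.5))  # ~1174 Hz
    print(helmholtz_hz(1.0, 30.0, 1.5))  # ~830 Hz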
@article {pmid34852626, year = {2021}, author = {Roberts, B and Summers, RJ and Bailey, PJ}, title = {Mandatory dichotic integration of second-formant information: Contralateral sine bleats have predictable effects on consonant place judgments.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {5}, pages = {3693}, doi = {10.1121/10.0007132}, pmid = {34852626}, issn = {1520-8524}, mesh = {Acoustic Stimulation ; Judgment ; Phonetics ; Speech Acoustics ; *Speech Intelligibility ; *Speech Perception ; }, abstract = {Speech-on-speech informational masking arises because the interferer disrupts target processing (e.g., capacity limitations) or corrupts it (e.g., intrusions into the target percept); the latter should produce predictable errors. Listeners identified the consonant in monaural buzz-excited three-formant analogues of approximant-vowel syllables, forming a place of articulation series (/w/-/l/-/j/). There were two 11-member series; the vowel was either high-front or low-back. Series members shared formant-amplitude contours, fundamental frequency, and F1+F3 frequency contours; they were distinguished solely by the F2 frequency contour before the steady portion. Targets were always presented in the left ear. For each series, F2 frequency and amplitude contours were also used to generate interferers with altered source properties-sine-wave analogues of F2 (sine bleats) matched to their buzz-excited counterparts. Accompanying each series member with a fixed mismatched sine bleat in the contralateral ear produced systematic and predictable effects on category judgments; these effects were usually largest for bleats involving the fastest rate or greatest extent of frequency change. Judgments of isolated sine bleats using the three place labels were often unsystematic or arbitrary. These results indicate that informational masking by interferers involved corruption of target processing as a result of mandatory dichotic integration of F2 information, despite the grouping cues disfavoring this integration.}, }
@article {pmid34852620, year = {2021}, author = {Lodermeyer, A and Bagheri, E and Kniesburges, S and Näger, C and Probst, J and Döllinger, M and Becker, S}, title = {The mechanisms of harmonic sound generation during phonation: A multi-modal measurement-based approach.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {5}, pages = {3485}, doi = {10.1121/10.0006974}, pmid = {34852620}, issn = {1520-8524}, mesh = {Glottis/diagnostic imaging ; Humans ; *Larynx ; *Phonation ; Sound ; Vocal Cords/diagnostic imaging ; }, abstract = {Sound generation during voiced speech remains an open research topic because the underlying process within the human larynx is hardly accessible for direct measurements. In the present study, harmonic sound generation during phonation was investigated with a model that replicates the fully coupled fluid-structure-acoustic interaction (FSAI). The FSAI was captured using a multi-modal approach by measuring the flow and acoustic source fields based on particle image velocimetry, as well as the surface velocity of the vocal folds based on laser vibrometry and high-speed imaging. Strong harmonic sources were localized near the glottis, as well as further downstream, during the presence of the supraglottal jet. The strongest harmonic content of the vocal fold surface motion was verified for the area near the glottis, which directly interacts with the glottal jet flow. Also, the acoustic back-coupling of the formant frequencies onto the harmonic oscillation of the vocal folds was verified. These findings verify that harmonic sound generation is the result of a strong interrelation between the vocal fold motion, modulated flow field, and vocal tract geometry.}, }
@article {pmid34852594, year = {2021}, author = {Barreda, S and Assmann, PF}, title = {Perception of gender in children's voices.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {5}, pages = {3949}, doi = {10.1121/10.0006785}, pmid = {34852594}, issn = {1520-8524}, mesh = {Acoustics ; Adult ; Child ; Cues ; Female ; Humans ; Male ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {To investigate the perception of gender from children's voices, adult listeners were presented with /hVd/ syllables, in isolation and in sentence context, produced by children between 5 and 18 years. Half the listeners were informed of the age of the talker during trials, while the other half were not. Correct gender identifications increased with talker age; however, performance was above chance even for age groups where the cues most often associated with gender differentiation (i.e., average fundamental frequency and formant frequencies) were not consistently different between boys and girls. The results of acoustic models suggest that cues were used in an age-dependent manner, whether listeners were explicitly told the age of the talker or not. Overall, results are consistent with the hypothesis that talker age and gender are estimated jointly in the process of speech perception. Furthermore, results show that the gender of individual talkers can be identified accurately well before reliable anatomical differences arise in the vocal tracts of females and males. In general, results support the notion that the transmission of gender information from voice depends substantially on gender-dependent patterns of articulation, rather than following deterministically from anatomical differences between male and female talkers.}, }
@article {pmid34847585, year = {2021}, author = {Wilson, RH and Scherer, NJ}, title = {Waveform Amplitude and Temporal Symmetric/Asymmetric Characteristics of Phoneme and Syllable Segments in the W-1 Spondaic Words Recorded by Four Speakers.}, journal = {Journal of the American Academy of Audiology}, volume = {32}, number = {7}, pages = {445-463}, doi = {10.1055/s-0041-1730959}, pmid = {34847585}, issn = {2157-3107}, mesh = {Data Collection ; Female ; Humans ; Male ; *Phonetics ; *Speech ; }, abstract = {BACKGROUND: The amplitude and temporal asymmetry of the speech waveform are mostly associated with voiced speech utterances and are obvious in recent graphic depictions in the literature. The asymmetries are attributed to the presence and interactions of the major formants characteristic of voicing with possible contributions from the unidirectional air flow that accompanies speaking.
PURPOSE: This study investigated the amplitude symmetry/asymmetry characteristics (polarity) of speech waveforms that to our knowledge have not been quantified.
STUDY SAMPLE: Thirty-six spondaic words spoken by two male speakers and two female speakers were selected because they were multisyllabic words providing a reasonable sampling of speech sounds and four recordings were available that were not related to the topic under study.
RESEARCH DESIGN: Collectively, the words were segmented into phonemes (vowels [130], diphthongs [77], voiced consonants [258], voiceless consonants [219]), syllables (82), and blends (6). For each segment the following were analyzed separately for the positive and negative datum points: peak amplitude, the percent of the total segment datum points, the root-mean-square (rms) amplitude, and the crest factor.
DATA COLLECTION AND ANALYSES: The digitized words (44,100 samples/s; 16-bit) were parsed into 144 files (36 words × 4 speakers), edited, transcribed to numeric values (±1), and stored in a spreadsheet in which all analyses were performed with in-house routines. Overall, approximately 85% of each waveform was analyzed, which excluded portions of silent intervals, transitions, and diminished waveform endings.
RESULTS: The vowel, diphthong, and syllable segments had durations (180-220 ms) that were about twice as long as the consonant durations (∼90 ms) and peak and rms amplitudes that were 6 to 12 dB higher than the consonant peak and rms amplitudes. Vowel, diphthong, and syllable segments had 10% more positive datum points (55%) than negative points (45%), which suggested temporal asymmetries within the segments. With voiced consonants, the distribution of positive and negative datum points dropped to 52 and 48% and essentially was equal with the voiceless consonants (50.3 and 49.6%). The mean rms amplitudes of the negative datum points were higher than the rms amplitudes for the positive points by 2 dB (vowels, diphthongs, and syllables), 1 dB (voiced consonants), and 0.1 dB (voiceless consonants). The 144 waveforms and segmentations are illustrated in the Supplementary Material along with the tabularized positive and negative segment characteristics.
CONCLUSIONS: The temporal and amplitude waveform asymmetries were by far most notable in segments that had a voicing component, which included the voiced consonants. These asymmetries were characterized by larger envelopes and more energy in the negative side of the waveform segment than in the positive side. Interestingly, these segments had more positive datum points than negative points, which indicated temporal asymmetry. All aspects of the voiceless consonants were equally divided between the positive and negative domains. There were female/male differences but with these limited samples such differences should not be generalized beyond the speakers in this study. The influence of the temporal and amplitude asymmetries on monaural word-recognition performance is thought to be negligible.}, }
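The per-polarity measures analyzed in the study above are simple to reproduce on any segmented waveform. A numpy sketch; the ±1 scaling follows the paper's description, and everything else (the segment variable, dB conventions) is generic:

    import numpy as np

    def polarity_stats(seg):
        # Per-polarity share of datum points, peak, rms, and crest factor
        # for a waveform segment scaled to +/-1.
        stats = {}
        for name, part in (("positive", seg[seg > 0]), ("negative", -seg[seg < 0])):
            rms = np.sqrt(np.mean(part ** 2))
            stats[name] = {
                "share_%": 100.0 * part.size / seg.size,
                "peak": float(part.max()),
                "rms_dB": 20.0 * np.log10(rms),
                "crest_dB": 20.0 * np.log10(part.max() / rms),
            }
        return stats

    # e.g. polarity_stats(vowel_segment)  # hypothetical 44.1 kHz vowel segment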
@article {pmid34827803, year = {2021}, author = {Hedwig, D and Poole, J and Granli, P}, title = {Does Social Complexity Drive Vocal Complexity? Insights from the Two African Elephant Species.}, journal = {Animals : an open access journal from MDPI}, volume = {11}, number = {11}, pages = {}, pmid = {34827803}, issn = {2076-2615}, abstract = {The social complexity hypothesis (SCH) for communication states that the range and frequency of social interactions drive the evolution of complex communication systems. Surprisingly, few studies have empirically tested the SCH for vocal communication systems. Filling this gap is important because a co-evolutionary runaway process between social and vocal complexity may have shaped the most intricate communication system, human language. We here propose the African elephant Loxodonta spec. as an excellent study system to investigate the relationships between social and vocal complexity. We review how the distinct differences in social complexity between the two species of African elephants, the forest elephant L. cyclotis and the savanna elephant L. africana, relate to repertoire size and structure, as well as complex communication skills in the two species, such as call combination or intentional formant modulation including the trunk. Our findings suggest that Loxodonta may contradict the SCH, as well as other factors put forth to explain patterns of vocal complexity across species. We propose that life history traits, a factor that has gained little attention as a driver of vocal complexity, and the extensive parental care associated with a uniquely low and slow reproductive rate, may have led to the emergence of pronounced vocal complexity in the forest elephant despite their less complex social system compared to the savanna elephant. Conclusions must be drawn cautiously, however. A better understanding of vocal complexity in the genus Loxodonta will depend on continuing advancements in remote data collection technologies to overcome the challenges of observing forest elephants in their dense rainforest habitat, as well as the availability of directly comparable data and methods, quantifying both structural and contextual variability in the production of rumbles and other vocalizations in both species of African elephants.}, }
@article {pmid34809062, year = {2021}, author = {Du, X and Zhang, X and Wang, Y and Ma, G and Liu, Y and Wang, B and Mao, H}, title = {Highly sensitive detection of plant growth regulators by using terahertz time-domain spectroscopy combined with metamaterials.}, journal = {Optics express}, volume = {29}, number = {22}, pages = {36535-36545}, doi = {10.1364/OE.437909}, pmid = {34809062}, issn = {1094-4087}, mesh = {Biosensing Techniques/*methods ; Computer Simulation ; Equipment Design ; Glycylglycine/*analysis ; Hydrazines/*analysis ; Plant Growth Regulators/*analysis ; Plants/*chemistry ; Refractometry ; Sensitivity and Specificity ; Terahertz Spectroscopy/instrumentation/*methods ; }, abstract = {The rapid and sensitive detection of plant-growth-regulator (PGR) residue is essential for ensuring food safety for consumers. However, there are many disadvantages in current approaches to detecting PGR residue. In this paper, we demonstrate a highly sensitive PGR detection method by using terahertz time-domain spectroscopy combined with metamaterials. We propose a double formant metamaterial resonator based on a split-ring structure with titanium-gold nanostructure. The metamaterial resonator is a split-ring structure composed of a titanium-gold nanostructure based on polyimide film as the substrate. Also, terahertz spectral response and electric field distribution of metamaterials under different analyte thickness and refractive index were investigated. The simulation results showed that the theoretical sensitivity of resonance peak 1 and peak 2 of the refractive index sensor based on our designed metamaterial resonator approaches 780 and 720 gigahertz per refractive index unit (GHz/RIU), respectively. In experiments, a rapid solution analysis platform based on the double formant metamaterial resonator was set up and PGR residues in aqueous solution were directly and rapidly detected through terahertz time-domain spectroscopy. The results showed that metamaterials can successfully detect butylhydrazine and N-N diglycine at a concentration as low as 0.05 mg/L. This study paves a new way for sensitive, rapid, low-cost detection of PGRs. It also means that the double formant metamaterial resonator has significant potential for other applications in terahertz sensing.}, }
@article {pmid34808474, year = {2022}, author = {Li, P and Ross, CF and Luo, ZX}, title = {Morphological disparity and evolutionary transformations in the primate hyoid apparatus.}, journal = {Journal of human evolution}, volume = {162}, number = {}, pages = {103094}, doi = {10.1016/j.jhevol.2021.103094}, pmid = {34808474}, issn = {1095-8606}, mesh = {Animals ; Female ; Haplorhini ; Hyoid Bone/anatomy & histology ; Phylogeny ; *Placenta ; Pregnancy ; *Primates/anatomy & histology ; }, abstract = {The hyoid apparatus plays an integral role in swallowing, respiration, and vocalization in mammals. Most placental mammals have a rod-shaped basihyal connected to the basicranium via both soft tissues and a mobile bony chain-the anterior cornu-whereas anthropoid primates have broad, shield-like or even cup-shaped basihyals suspended from the basicranium by soft tissues only. How the unique anthropoid hyoid morphology evolved is unknown, and hyoid morphology of nonanthropoid primates is poorly documented. Here we use phylogenetic comparative methods and linear morphometrics to address knowledge gaps in hyoid evolution among primates and their euarchontan outgroups. We find that dermopterans have variable reduction of cornu elements. Cynocephalus volans are sexually dimorphic in hyoid morphology. Tupaia and all lemuroids except Daubentonia have a fully ossified anterior cornu connecting a rod-shaped basihyal to the basicranium; this is the ancestral mammalian pattern that is also characteristic of the last common ancestor of Primates. Haplorhines exhibit a reduced anterior cornu, and anthropoids underwent further increase in basihyal aspect ratio values and in relative basihyal volume. Convergent with haplorhines, lorisoid strepsirrhines independently evolved a broad basihyal and reduced anterior cornua. While a reduced anterior cornu is hypothesized to facilitate vocal tract lengthening and lower formant frequencies in some mammals, our results suggest vocalization adaptations alone are unlikely to drive the iterative reduction of anterior cornua within Primates. Our new data on euarchontan hyoid evolution provide an anatomical basis for further exploring the form-function relationships of the hyoid across different behaviors, including vocalization, chewing, and swallowing.}, }
@article {pmid34799495, year = {2022}, author = {Xu, L and Luo, J and Xie, D and Chao, X and Wang, R and Zahorik, P and Luo, X}, title = {Reverberation Degrades Pitch Perception but Not Mandarin Tone and Vowel Recognition of Cochlear Implant Users.}, journal = {Ear and hearing}, volume = {43}, number = {4}, pages = {1139-1150}, doi = {10.1097/AUD.0000000000001173}, pmid = {34799495}, issn = {1538-4667}, mesh = {*Cochlear Implantation ; *Cochlear Implants ; *Deafness/rehabilitation ; Humans ; Pitch Perception/physiology ; *Speech Perception/physiology ; }, abstract = {OBJECTIVES: The primary goal of this study was to investigate the effects of reverberation on Mandarin tone and vowel recognition of cochlear implant (CI) users and normal-hearing (NH) listeners. To understand the performance of Mandarin tone recognition, this study also measured participants' pitch perception and the availability of temporal envelope cues in reverberation.
DESIGN: Fifteen CI users and nine NH listeners, all Mandarin speakers, were asked to recognize Mandarin single-vowels produced in four lexical tones and rank harmonic complex tones in pitch with different reverberation times (RTs) from 0 to 1 second. Virtual acoustic techniques were used to simulate rooms with different degrees of reverberation. Vowel duration and correlation between amplitude envelope and fundamental frequency (F0) contour were analyzed for different tones as a function of the RT.
RESULTS: Vowel durations of different tones significantly increased with longer RTs. Amplitude-F0 correlation remained similar for the falling Tone 4 but greatly decreased for the other tones in reverberation. NH listeners had robust pitch-ranking, tone recognition, and vowel recognition performance as the RT increased. Reverberation significantly degraded CI users' pitch-ranking thresholds but did not significantly affect the overall scores of tone and vowel recognition with CIs. Detailed analyses of tone confusion matrices showed that CI users reduced the flat Tone-1 responses but increased the falling Tone-4 responses in reverberation, possibly due to the falling amplitude envelope of late reflections after the original vowel segment. CI users' tone recognition scores were not correlated with their pitch-ranking thresholds.
CONCLUSIONS: NH listeners can reliably recognize Mandarin tones in reverberation using salient pitch cues from spectral and temporal fine structures. However, CI users have poorer pitch perception using F0-related amplitude modulations that are reduced in reverberation. Reverberation distorts speech amplitude envelopes, which affect the distribution of tone responses but not the accuracy of tone recognition with CIs. Recognition of vowels with stationary formant trajectories is not affected by reverberation for both NH listeners and CI users, regardless of the available spectral resolution. Future studies should test how the relatively stable vowel and tone recognition may contribute to sentence recognition in reverberation of Mandarin-speaking CI users.}, }
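The amplitude-F0 correlation analyzed in the study above can be sketched with a Hilbert envelope and a Pearson correlation. The 30 Hz envelope cutoff is an assumption, and the F0 contour is presumed already resampled to the audio rate and NaN-free:

    import numpy as np
    from scipy.signal import hilbert, butter, filtfilt

    def envelope_f0_correlation(y, f0, sr, env_cutoff=30.0):
        # Correlate the low-pass-filtered Hilbert magnitude (the amplitude
        # envelope) with the fundamental-frequency contour.
        env = np.abs(hilbert(y))
        b, a = butter(4, env_cutoff / (sr / 2))
        env = filtfilt(b, a, env)
        return np.corrcoef(env, f0)[0, 1]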
@article {pmid34783468, year = {2021}, author = {Kovalenko, AN and Kastyro, IV and Popadyuk, VI and Vostrikov, AM and Sheveleva, VA and Kleyman, VK and Shalamov, KP and Torshin, VI}, title = {[Dynamics of vowel acoustic space indicators in patients with long-term hearing loss].}, journal = {Vestnik otorinolaringologii}, volume = {86}, number = {5}, pages = {17-21}, doi = {10.17116/otorino20218605117}, pmid = {34783468}, issn = {0042-4668}, mesh = {Acoustics ; Adult ; *Deafness ; Female ; *Hearing Loss/diagnosis ; Humans ; Male ; Phonetics ; Russia ; Speech Acoustics ; }, abstract = {UNLABELLED: A new procedure for transforming the vowel acoustic space (VAS; vowel acoustic triangles) was developed to characterize vowel production in individuals with long-term hearing loss (HL).
OBJECTIVE: To characterize VAS of adult Russian speakers with long-term HL using newly developed acoustic indicators.
MATERIAL AND METHODS: Recordings of the sustained Russian cardinal vowels /a/, /i/, /u/ from 10 women and 10 men with long-term HL were acoustically analyzed. For each participant, the first two formants of each vowel were measured and log-transformed (logF1, logF2). Each VAS was transformed into a right triangle, its /u/ corner was moved to the origin, and its legs were aligned with the axes. The VAS was almost symmetrical, of equal size, and maximal in the control group, which consisted of subjects without hearing impairment, whereas in the long-term HL group the VAS tended to be reduced in size and stretched along one axis.
RESULTS: Our study showed that a new VAS normalization approach can distinguish at least three groups of people with long-term HL.
CONCLUSION: There are those with vowel triangles stretched along the logF1 axis, those with triangles stretched along the logF2 axis, and those with symmetrical vowel triangles. The causes of these VAS differences require further investigation.}, }
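One plausible reading of the VAS transformation described above, as a sketch: log-transform the corner vowels' (F1, F2) values, translate the /u/ corner to the origin, and rotate the /u/-to-/i/ leg onto the logF2 axis. The specific rotation convention and the example frequencies are assumptions:

    import numpy as np

    def normalized_vas(f_a, f_i, f_u):
        # Rows are (logF1, logF2) for /a/, /i/, /u/; translate /u/ to the
        # origin and rotate the /u/ -> /i/ leg onto the logF2 axis.
        pts = np.log10(np.array([f_a, f_i, f_u], dtype=float))
        pts = pts - pts[2]
        l1, l2 = pts[1]                         # /u/ -> /i/ leg
        ang = np.pi / 2 - np.arctan2(l2, l1)    # angle needed to reach F2 axis
        rot = np.array([[np.cos(ang), -np.sin(ang)],
                        [np.sin(ang),  np.cos(ang)]])
        return pts @ rot.T

    # e.g. normalized_vas((850, 1200), (300, 2300), (320, 800))  # (F1, F2) Hz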
@article {pmid34776842, year = {2021}, author = {Melchor, J and Vergara, J and Figueroa, T and Morán, I and Lemus, L}, title = {Formant-Based Recognition of Words and Other Naturalistic Sounds in Rhesus Monkeys.}, journal = {Frontiers in neuroscience}, volume = {15}, number = {}, pages = {728686}, pmid = {34776842}, issn = {1662-4548}, abstract = {In social animals, identifying sounds is critical for communication. In humans, the acoustic parameters involved in speech recognition, such as the formant frequencies derived from the resonance of the supralaryngeal vocal tract, have been well documented. However, how formants contribute to recognizing learned sounds in non-human primates remains unclear. To determine this, we trained two rhesus monkeys to discriminate target and non-target sounds presented in sequences of 1-3 sounds. After training, we performed three experiments: (1) We tested the monkeys' accuracy and reaction times during the discrimination of various acoustic categories; (2) their ability to discriminate morphing sounds; and (3) their ability to identify sounds consisting of formant 1 (F1), formant 2 (F2), or F1 and F2 (F1F2) pass filters. Our results indicate that macaques can learn diverse sounds and discriminate morphed sounds and sounds reduced to formants F1 and F2, suggesting that information from a few acoustic parameters suffices for recognizing complex sounds. We anticipate that future neurophysiological experiments in this paradigm may help elucidate how formants contribute to the recognition of sounds.}, }
@article {pmid34775826, year = {2022}, author = {Cartei, V and Reby, D and Garnham, A and Oakhill, J and Banerjee, R}, title = {Peer audience effects on children's vocal masculinity and femininity.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {377}, number = {1841}, pages = {20200397}, pmid = {34775826}, issn = {1471-2970}, mesh = {Acoustics ; Child ; Female ; *Femininity ; Humans ; Male ; Masculinity ; *Voice ; }, abstract = {Existing evidence suggests that children from around the age of 8 years strategically alter their public image in accordance with known values and preferences of peers, through the self-descriptive information they convey. However, an important but neglected aspect of this 'self-presentation' is the medium through which such information is communicated: the voice itself. The present study explored peer audience effects on children's vocal productions. Fifty-six children (26 females, aged 8-10 years) were presented with vignettes where a fictional child, matched to the participant's age and sex, is trying to make friends with a group of same-sex peers with stereotypically masculine or feminine interests (rugby and ballet, respectively). Participants were asked to impersonate the child in that situation and, as the child, to read out loud masculine, feminine and gender-neutral self-descriptive statements to these hypothetical audiences. They also had to decide which of those self-descriptive statements would be most helpful for making friends. In line with previous research, boys and girls preferentially selected masculine or feminine self-descriptive statements depending on the audience interests. Crucially, acoustic analyses of fundamental frequency and formant frequency spacing revealed that children also spontaneously altered their vocal productions: they feminized their voices when speaking to members of the ballet club, while they masculinized their voices when speaking to members of the rugby club. Both sexes also feminized their voices when uttering feminine sentences, compared to when uttering masculine and gender-neutral sentences. Implications for the hitherto neglected role of acoustic qualities of children's vocal behaviour in peer interactions are discussed. This article is part of the theme issue 'Voice modulation: from origin and mechanism to social impact (Part II)'.}, }
@article {pmid34775821, year = {2022}, author = {Pisanski, K and Anikin, A and Reby, D}, title = {Vocal size exaggeration may have contributed to the origins of vocalic complexity.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {377}, number = {1841}, pages = {20200401}, pmid = {34775821}, issn = {1471-2970}, mesh = {Acoustics ; Animals ; Body Size ; Speech ; Vocalization, Animal ; *Voice ; }, abstract = {Vocal tract elongation, which uniformly lowers vocal tract resonances (formant frequencies) in animal vocalizations, has evolved independently in several vertebrate groups as a means for vocalizers to exaggerate their apparent body size. Here, we propose that smaller speech-like articulatory movements that alter only individual formants can serve a similar yet less energetically costly size-exaggerating function. To test this, we examine whether uneven formant spacing alters the perceived body size of vocalizers in synthesized human vowels and animal calls. Among six synthetic vowel patterns, those characterized by the lowest first and second formant (the vowel /u/ as in 'boot') are consistently perceived as produced by the largest vocalizer. Crucially, lowering only one or two formants in animal-like calls also conveys the impression of a larger body size, and lowering the second and third formants simultaneously exaggerates perceived size to a similar extent as rescaling all formants. As the articulatory movements required for individual formant shifts are minor compared to full vocal tract extension, they represent a rapid and energetically efficient mechanism for acoustic size exaggeration. We suggest that, by favouring the evolution of uneven formant patterns in vocal communication, this deceptive strategy may have contributed to the origins of the phonemic diversification required for articulated speech. This article is part of the theme issue 'Voice modulation: from origin and mechanism to social impact (Part II)'.}, }
@article {pmid34775819, year = {2022}, author = {Grawunder, S and Uomini, N and Samuni, L and Bortolato, T and Girard-Buttoz, C and Wittig, RM and Crockford, C}, title = {Chimpanzee vowel-like sounds and voice quality suggest formant space expansion through the hominoid lineage.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {377}, number = {1841}, pages = {20200455}, pmid = {34775819}, issn = {1471-2970}, mesh = {Acoustics ; Animals ; *Pan troglodytes/physiology ; Phonetics ; Speech Acoustics ; *Voice Quality ; }, abstract = {The origins of human speech are obscure; it is still unclear what aspects are unique to our species or shared with our evolutionary cousins, in part due to a lack of a common framework for comparison. We asked what chimpanzee and human vocal production acoustics have in common. We examined visible supra-laryngeal articulators of four major chimpanzee vocalizations (hoos, grunts, barks, screams) and their associated acoustic structures, using techniques from human phonetic and animal communication analysis. Data were collected from wild adult chimpanzees, Taï National Park, Ivory Coast. Both discriminant and principal component classification procedures revealed classification of call types. Discriminating acoustic features include voice quality and formant structure, mirroring phonetic features in human speech. Chimpanzee lip and jaw articulation variables also offered similar discrimination of call types. Formant maps distinguished call types with different vowel-like sounds. Comparing our results with published primate data, humans show less F1-F2 correlation and further expansion of the vowel space, particularly for [i] sounds. Unlike recent studies suggesting monkeys achieve human vowel space, we conclude from our results that supra-laryngeal articulatory capacities show moderate evolutionary change, with vowel space expansion continuing through hominoid evolution. Studies on more primate species will be required to substantiate this. This article is part of the theme issue 'Voice modulation: from origin and mechanism to social impact (Part II)'.}, }
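Two quantities discussed in the entry above, F1-F2 correlation and the extent of the vowel space, are easy to compute from a formant map. A minimal sketch in which the vowel space is taken as the convex hull of the F1-F2 points; the points themselves are invented:

    import numpy as np
    from scipy.spatial import ConvexHull

    # F1-F2 points (Hz) for a set of vocalizations (toy values).
    pts = np.array([[350, 2300], [500, 1800], [700, 1200],
                    [550, 900], [400, 800], [300, 1900]], dtype=float)

    hull = ConvexHull(pts)                            # hull of the formant map
    print(hull.volume)                                # in 2-D, .volume is area
    print(np.corrcoef(pts[:, 0], pts[:, 1])[0, 1])    # F1-F2 correlation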
@article {pmid34756498, year = {2021}, author = {Davatz, GC and Yamasaki, R and Hachiya, A and Tsuji, DH and Montagnoli, AN}, title = {Source and Filter Acoustic Measures of Young, Middle-Aged and Elderly Adults for Application in Vowel Synthesis.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2021.08.025}, pmid = {34756498}, issn = {1873-4588}, abstract = {INTRODUCTION: The output sound changes in important ways throughout life due to anatomical and physiological modifications of the larynx and vocal tract. Understanding the acoustic characteristics of speech from young adulthood to old age may assist in the synthesis of voices representative of men and women of different age groups.
OBJECTIVE: To obtain the fundamental frequency (f0), formant frequencies (F1, F2, F3, F4), and bandwidth (B1, B2, B3, B4) values extracted from the sustained vowel /a/ of young, middle-aged, and elderly adults who are Brazilian Portuguese speakers; to present the application of these parameters in vowel synthesis.
STUDY DESIGN: Prospective study.
METHODS: Acoustic analysis was performed on 162 tokens of the sustained vowel /a/ produced by vocally healthy adults, men and women, between 18 and 80 years old. The adults were divided into three groups: young adults (18 to 44 years old); middle-aged adults (45 to 59 years old); and elderly adults (60 to 80 years old). The f0, F1, F2, F3, F4, B1, B2, B3, and B4 were extracted from the audio signals. Their average values were applied to a source-filter mathematical model to perform vowel synthesis for each age group, for both men and women.
RESULTS: Young women had higher f0 than middle-aged and elderly women. Elderly women had lower F1 than middle-aged women. Young women had higher F2 than elderly women. For men, the source-filter acoustic measures were statistically equivalent among the age groups. Average values of f0, F1, F2, F3, F4, B1, and B2 were higher in women. The spacing of the sound waves in the signals, the positions of the formant frequencies, and the extents of the bandwidths visible in the spectra of the synthesized sounds reflect the average values extracted from the volunteers' productions of the sustained vowel /a/ in Brazilian Portuguese.
CONCLUSION: The sustained vowel /a/ produced by women presented different values of f0, F1, and F2 between age groups, which was not observed for men. In addition to f0 and the formant frequencies, the bandwidths also differed between women and men. The synthetic vowels made available represent the acoustic changes found for each sex as a function of age.}, }
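The source-filter application described above corresponds to the classic cascade formant synthesizer: an impulse-train source at f0 filtered through one second-order resonator per formant, each pole placed by its (Fi, Bi) pair. A minimal sketch with illustrative parameter values, not the paper's measured averages:

    import numpy as np
    from scipy.signal import lfilter

    def synth_vowel(f0, formants, bandwidths, dur=1.0, sr=16000):
        # Impulse-train glottal source at f0, shaped by a cascade of
        # second-order resonators, one per formant (Fi, Bi).
        n = int(dur * sr)
        src = np.zeros(n)
        src[::int(sr / f0)] = 1.0
        for f, bw in zip(formants, bandwidths):
            r = np.exp(-np.pi * bw / sr)                 # radius from bandwidth
            theta = 2 * np.pi * f / sr                   # angle from frequency
            b = [1.0 - 2.0 * r * np.cos(theta) + r * r]  # unity gain at DC
            a = [1.0, -2.0 * r * np.cos(theta), r * r]
            src = lfilter(b, a, src)
        return src / np.max(np.abs(src))

    # Rough adult-female /a/ with invented formant and bandwidth values:
    y = synth_vowel(200, [850, 1220, 2810, 3900], [80, 90, 120, 130])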
@article {pmid34735295, year = {2021}, author = {Rowe, HP and Stipancic, KL and Lammert, AC and Green, JR}, title = {Validation of an Acoustic-Based Framework of Speech Motor Control: Assessing Criterion and Construct Validity Using Kinematic and Perceptual Measures.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {12}, pages = {4736-4753}, pmid = {34735295}, issn = {1558-9102}, support = {F31 DC019556/DC/NIDCD NIH HHS/United States ; R01 DC013547/DC/NIDCD NIH HHS/United States ; R01 DC009890/DC/NIDCD NIH HHS/United States ; K24 DC016312/DC/NIDCD NIH HHS/United States ; R01 DC017291/DC/NIDCD NIH HHS/United States ; }, mesh = {Acoustics ; Biomechanical Phenomena ; Humans ; *Speech ; Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {PURPOSE: This study investigated the criterion (analytical and clinical) and construct (divergent) validity of a novel, acoustic-based framework composed of five key components of motor control: Coordination, Consistency, Speed, Precision, and Rate.
METHOD: Acoustic and kinematic analyses were performed on audio recordings from 22 subjects with amyotrophic lateral sclerosis during a sequential motion rate task. Perceptual analyses were completed by two licensed speech-language pathologists, who rated each subject's speech on the five framework components and their overall severity. Analytical and clinical validity were assessed by comparing performance on the acoustic features to their kinematic correlates and to clinician ratings of the five components, respectively. Divergent validity of the acoustic-based framework was then assessed by comparing performance on each pair of acoustic features to determine whether the features represent distinct articulatory constructs. Bivariate correlations and partial correlations with severity as a covariate were conducted for each comparison.
RESULTS: Results revealed moderate-to-strong analytical validity for every acoustic feature, both with and without controlling for severity, and moderate-to-strong clinical validity for all acoustic features except Coordination, without controlling for severity. When severity was included as a covariate, the strong associations for Speed and Precision became weak. Divergent validity was supported by weak-to-moderate pairwise associations between all acoustic features except Speed (second-formant [F2] slope of consonant transition) and Precision (between-consonant variability in F2 slope).
CONCLUSIONS: This study demonstrated that the acoustic-based framework has potential as an objective, valid, and clinically useful tool for profiling articulatory deficits in individuals with speech motor disorders. The findings also suggest that compared to clinician ratings, instrumental measures are more sensitive to subtle differences in articulatory function. With further research, this framework could provide more accurate and reliable characterizations of articulatory impairment, which may eventually increase clinical confidence in the diagnosis and treatment of patients with different articulatory phenotypes.}, }
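Since the framework's Speed and Precision components are defined in terms of F2 slopes of consonant transitions, a short sketch of how such a slope might be computed from a sampled F2 track may help; the track and per-repetition slopes below are hypothetical numbers, not data from the study.

```python
import numpy as np

# Hypothetical F2 track (Hz) sampled every 5 ms across a 60 ms consonant transition
t = np.arange(12) * 0.005
f2 = np.array([1500, 1540, 1600, 1670, 1740, 1800, 1850, 1890, 1920, 1940, 1955, 1965])

# Speed analogue: first-order least-squares slope of the transition (Hz/ms)
slope = np.polyfit(t, f2, 1)[0] / 1000
print(f"F2 slope: {slope:.2f} Hz/ms")

# Precision analogue: between-repetition variability of the slope
slopes = np.array([8.1, 7.6, 8.9, 7.2])  # hypothetical slopes across repetitions
print(f"SD of F2 slopes: {slopes.std(ddof=1):.2f} Hz/ms")
```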
@article {pmid34734018, year = {2021}, author = {Xia, M and Cao, S and Zhou, R and Wang, JY and Xu, TY and Zhou, ZK and Qian, YM and Jiang, H}, title = {Acoustic features as novel predictors of difficult laryngoscopy in orthognathic surgery: an observational study.}, journal = {Annals of translational medicine}, volume = {9}, number = {18}, pages = {1466}, pmid = {34734018}, issn = {2305-5839}, abstract = {BACKGROUND: Evaluation for difficult intubation is an important step before anaesthesia, as unanticipated difficult intubation is associated with morbidity and mortality. This study aimed to determine whether acoustic features are valuable as an alternative method to predict difficult laryngoscopy (DL) in patients scheduled to undergo orthognathic surgery.
METHODS: This study included 225 adult patients who were undergoing elective orthognathic surgery under general anaesthesia with tracheal intubation. Preoperatively, clinical airway evaluation was performed, and the acoustic data were collected. Twelve phonemes {[a], [o], [e], [i], [u], [ü], [ci], [qi], [chi], [le], [ke], and [en]} were recorded, and their formants (f1-f4) and bandwidths (bw1-bw4) were extracted. Difficult laryngoscopy was defined as direct laryngoscopy with a Cormack-Lehane grade of 3 or 4. Univariate and multivariate logistic regression analyses were used to examine the associations between acoustic features and DL.
RESULTS: Difficult laryngoscopy was reported in 59/225 (26.2%) patients. The area under the curve (AUC) of the backward stepwise model including en_f2 [odds ratio (OR), 0.996; 95% confidence interval (CI), 0.994-0.999; P=0.006], ci_bw4 (OR, 0.997; 95% CI, 0.993-1.000; P=0.057), qi_bw4 (OR, 0.996; 95% CI, 0.993-0.999; P=0.017), le_f3 (OR, 0.998; 95% CI, 0.996-1.000; P=0.079), o_bw4 (OR, 1.001; 95% CI, 1.000-1.003; P=0.014), chi_f4 (OR, 1.003; 95% CI, 1.000-1.005; P=0.041), and a_bw4 (OR, 0.999; 95% CI, 0.998-1.000; P=0.078) attained a value of 0.761 in the training set, but a value of 0.709 in the testing set. The sensitivity and specificity of the model in the testing set were 86.7% and 63.0%, respectively.
CONCLUSIONS: Acoustic features may be considered as useful predictors of DL during orthognathic surgery.}, }
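The modeling workflow in this entry (multivariate logistic regression on formant and bandwidth features, evaluated by AUC, sensitivity, and specificity on a held-out set) can be sketched as below; the arrays are random placeholders for the real acoustic data, and scikit-learn's plain LogisticRegression stands in for the authors' backward stepwise selection, which is not reproduced here.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Columns would hold features such as en_f2, ci_bw4, qi_bw4, le_f3, o_bw4, chi_f4, a_bw4
rng = np.random.default_rng(0)
X = rng.normal(size=(225, 7))      # placeholder formant/bandwidth measurements
y = rng.integers(0, 2, size=225)   # placeholder labels: 1 = Cormack-Lehane grade 3-4

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)
model = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print("testing-set AUC:", roc_auc_score(y_te, model.predict_proba(X_te)[:, 1]))
print("odds ratios:", np.exp(model.coef_).round(3))  # OR per unit change in each feature
```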
@article {pmid34731577, year = {2021}, author = {Abur, D and Subaciute, A and Daliri, A and Lester-Smith, RA and Lupiani, AA and Cilento, D and Enos, NM and Weerathunge, HR and Tardif, MC and Stepp, CE}, title = {Feedback and Feedforward Auditory-Motor Processes for Voice and Articulation in Parkinson's Disease.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {12}, pages = {4682-4694}, pmid = {34731577}, issn = {1558-9102}, support = {F31 DC019032/DC/NIDCD NIH HHS/United States ; R01 DC016270/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Feedback ; Feedback, Sensory ; Humans ; *Parkinson Disease/complications ; Speech ; Speech Intelligibility/physiology ; Speech Production Measurement ; *Voice ; }, abstract = {PURPOSE: Unexpected and sustained manipulations of auditory feedback during speech production result in "reflexive" and "adaptive" responses, which can shed light on feedback and feedforward auditory-motor control processes, respectively. Persons with Parkinson's disease (PwPD) have shown aberrant reflexive and adaptive responses, but responses appear to differ for control of vocal and articulatory features. However, these responses have not been examined for both voice and articulation in the same speakers and with respect to auditory acuity and functional speech outcomes (speech intelligibility and naturalness).
METHOD: Here, 28 PwPD on their typical dopaminergic medication schedule and 28 age-, sex-, and hearing-matched controls completed tasks yielding reflexive and adaptive responses as well as auditory acuity for both vocal and articulatory features.
RESULTS: No group differences were found for any measures of auditory-motor control, conflicting with prior findings in PwPD while off medication. Auditory-motor measures were also compared with listener ratings of speech function: first formant frequency acuity was related to speech intelligibility, whereas adaptive responses to vocal fundamental frequency manipulations were related to speech naturalness.
CONCLUSIONS: These results support that auditory-motor processes for both voice and articulatory features are intact for PwPD receiving medication. This work is also the first to suggest associations between measures of auditory-motor control and speech intelligibility and naturalness.}, }
@article {pmid34717445, year = {2021}, author = {Cheung, ST and Thompson, K and Chen, JL and Yunusova, Y and Beal, DS}, title = {Response patterns to vowel formant perturbations in children.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {4}, pages = {2647}, doi = {10.1121/10.0006567}, pmid = {34717445}, issn = {1520-8524}, mesh = {Adaptation, Physiological ; Adolescent ; Child ; Child, Preschool ; Feedback, Sensory ; Humans ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception ; Speech Production Measurement ; }, abstract = {Auditory feedback is an important component of speech motor control, but its precise role in developing speech is less understood. The role of auditory feedback in development was probed by perturbing the speech of children 4-9 years old. The vowel sound /ɛ/ was shifted to /æ/ in real time and presented to participants as their own auditory feedback. Analyses of the resultant formant magnitude changes in the participants' speech indicated that children compensated and adapted by adjusting their formants to oppose the perturbation. Older and younger children responded to perturbation differently in F1 and F2. The compensatory change in F1 was greater for younger children, whereas the increase in F2 was greater for older children. Adaptation aftereffects were observed in both groups. Exploratory directional analyses in the two-dimensional formant space indicated that older children responded more directly and less variably to the perturbation than younger children, shifting their vowels back toward the vowel sound /ɛ/ to oppose the perturbation. Findings support the hypothesis that auditory feedback integration continues to develop between the ages of 4 and 9 years old such that the differences in the adaptive and compensatory responses arise between younger and older children despite receiving the same auditory feedback perturbation.}, }
@article {pmid34717269, year = {2021}, author = {Tang, DL and McDaniel, A and Watkins, KE}, title = {Disruption of speech motor adaptation with repetitive transcranial magnetic stimulation of the articulatory representation in primary motor cortex.}, journal = {Cortex; a journal devoted to the study of the nervous system and behavior}, volume = {145}, number = {}, pages = {115-130}, pmid = {34717269}, issn = {1973-8102}, support = {/WT_/Wellcome Trust/United Kingdom ; }, mesh = {Adaptation, Physiological ; Feedback, Sensory ; Humans ; *Motor Cortex ; *Speech ; Transcranial Magnetic Stimulation ; }, abstract = {When auditory feedback perturbation is introduced in a predictable way over a number of utterances, speakers learn to compensate by adjusting their own productions, a process known as sensorimotor adaptation. Despite multiple lines of evidence indicating the role of primary motor cortex (M1) in motor learning and memory, whether M1 causally contributes to sensorimotor adaptation in the speech domain remains unclear. Here, we aimed to assay whether temporary disruption of the articulatory representation in left M1 by repetitive transcranial magnetic stimulation (rTMS) impairs speech adaptation. To induce sensorimotor adaptation, the frequencies of first formants (F1) were shifted up and played back to participants when they produced "head", "bed", and "dead" repeatedly (the learning phase). A low-frequency rTMS train (0.6 Hz, subthreshold, 12 min) over either the tongue or the hand representation of M1 (between-subjects design) was applied before participants experienced altered auditory feedback in the learning phase. We found that the group who received rTMS over the hand representation showed the expected compensatory response for the upwards shift in F1 by significantly reducing F1 and increasing the second formant (F2) frequencies in their productions. In contrast, these expected compensatory changes in both F1 and F2 did not occur in the group that received rTMS over the tongue representation. Critically, rTMS (subthreshold) over the tongue representation did not affect vowel production, which was unchanged from baseline. These results provide direct evidence that the articulatory representation in left M1 causally contributes to sensorimotor learning in speech. Furthermore, these results also suggest that M1 is critical to the network supporting a more global adaptation that aims to move the altered speech production closer to a learnt pattern of speech production used to produce another vowel.}, }
@article {pmid34714438, year = {2022}, author = {Sturdy, SK and Smith, DRR and George, DN}, title = {Domestic dogs (Canis lupus familiaris) are sensitive to the correlation between pitch and timbre in human speech.}, journal = {Animal cognition}, volume = {25}, number = {3}, pages = {545-554}, pmid = {34714438}, issn = {1435-9456}, mesh = {Animals ; Dogs ; Female ; Humans ; Male ; Pitch Perception ; Sex Characteristics ; Speech ; Speech Acoustics ; *Voice ; *Wolves ; }, abstract = {The perceived pitch of human voices is highly correlated with the fundamental frequency (f0) of the laryngeal source, which is determined largely by the length and mass of the vocal folds. The vocal folds are larger in adult males than in adult females, and men's voices consequently have a lower pitch than women's. The length of the supralaryngeal vocal tract (vocal-tract length; VTL) affects the resonant frequencies (formants) of speech which characterize the timbre of the voice. Men's longer vocal tracts produce lower frequency, and less dispersed, formants than women's shorter vocal tracts. Pitch and timbre combine to influence the perception of speaker characteristics such as size and age. Together, they can be used to categorize speaker sex with almost perfect accuracy. While it is known that domestic dogs can match a voice to a person of the same sex, there has been no investigation into whether dogs are sensitive to the correlation between pitch and timbre. We recorded a female voice giving three commands ('Sit', 'Lay down', 'Come here'), and manipulated the recordings to lower the fundamental frequency (thus lowering pitch), increase simulated VTL (hence affecting timbre), or both (synthesized adult male voice). Dogs responded to the original adult female and synthesized adult male voices equivalently. Their tendency to obey the commands was, however, reduced when either pitch or timbre was manipulated alone. These results suggest that dogs are sensitive to both the pitch and timbre of human voices, and that they learn about the natural covariation of these perceptual attributes.}, }
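Stimulus manipulations of this kind (lowering f0 alone, lengthening the apparent vocal tract alone, or both) are commonly performed with Praat's "Change gender" command; the sketch below uses the parselmouth Python bindings with a hypothetical input file and illustrative parameter values.

```python
import parselmouth
from parselmouth.praat import call

snd = parselmouth.Sound("come_here_female.wav")  # hypothetical command recording

# "Change gender" arguments: pitch floor, pitch ceiling, formant shift ratio,
# new pitch median (0 = unchanged), pitch range factor, duration factor
low_pitch = call(snd, "Change gender", 75, 600, 1.0, 100, 1.0, 1.0)   # pitch only
long_vtl = call(snd, "Change gender", 75, 600, 0.85, 0, 1.0, 1.0)     # timbre only
male_like = call(snd, "Change gender", 75, 600, 0.85, 100, 1.0, 1.0)  # both

male_like.save("come_here_male.wav", "WAV")
```

A formant shift ratio below 1 scales all formants downward, which is the acoustic signature of a longer vocal tract.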
@article {pmid34649740, year = {2021}, author = {Lester-Smith, RA and Derrick, E and Larson, CR}, title = {Characterization of Source-Filter Interactions in Vocal Vibrato Using a Neck-Surface Vibration Sensor: A Pilot Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, pmid = {34649740}, issn = {1873-4588}, support = {90AR5015/ACL/ACL HHS/United States ; R21 DC017001/DC/NIDCD NIH HHS/United States ; }, abstract = {PURPOSE: Vocal vibrato is a singing technique that involves periodic modulation of fundamental frequency (fo) and intensity. The physiological sources of modulation within the speech mechanism and the interactions between the laryngeal source and vocal tract filter in vibrato are not fully understood. Therefore, the purpose of this study was to determine if differences in the rate and extent of fo and intensity modulation could be captured using simultaneously recorded signals from a neck-surface vibration sensor and a microphone, which represent features of the source before and after supraglottal vocal tract filtering.
METHOD: Nine classically-trained singers produced sustained vowels with vibrato while simultaneous signals were recorded using a vibration sensor and a microphone. Acoustical analyses were performed to measure the rate and extent of fo and intensity modulation for each trial. Paired-samples sign tests were used to analyze differences between the rate and extent of fo and intensity modulation in the vibration sensor and microphone signals.
RESULTS: The rate and extent of fo modulation and the extent of intensity modulation were equivalent in the vibration sensor and microphone signals, but the rate of intensity modulation was significantly higher in the microphone signal than in the vibration sensor signal. Larger differences in the rate of intensity modulation were seen with vowels that typically have smaller differences between the first and second formant frequencies.
CONCLUSIONS: This study demonstrated that the rate of intensity modulation at the source prior to supraglottal vocal tract filtering, as measured in neck-surface vibration sensor signals, was lower than the rate of intensity modulation after supraglottal vocal tract filtering, as measured in microphone signals. The difference in rate varied based on the vowel. These findings provide further support of the resonance-harmonics interaction in vocal vibrato. Further investigation is warranted to determine if differences in the physiological source(s) of vibrato account for inconsistent relationships between the extent of intensity modulation in neck-surface vibration sensor and microphone signals.}, }
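One plausible way to quantify the rate and extent of fo modulation in either signal is to inspect the spectrum of the extracted fo contour; the sketch below assumes parselmouth for pitch tracking and a hypothetical, essentially fully voiced sustained-vowel recording.

```python
import numpy as np
import parselmouth

snd = parselmouth.Sound("sustained_vowel.wav")  # hypothetical recording
pitch = snd.to_pitch(time_step=0.01)            # fo contour sampled at 100 Hz
f0 = pitch.selected_array['frequency']
f0 = f0[f0 > 0]                                 # drop any unvoiced frames

# Modulation rate: strongest component of the mean-removed contour in 3-9 Hz
spec = np.abs(np.fft.rfft(f0 - f0.mean()))
freqs = np.fft.rfftfreq(f0.size, d=0.01)
band = (freqs >= 3) & (freqs <= 9)
rate = freqs[band][np.argmax(spec[band])]

# Modulation extent: half the peak-to-peak fo excursion, in semitones
extent = 12 * np.log2(f0.max() / f0.min()) / 2
print(f"rate ~ {rate:.1f} Hz, extent ~ +/-{extent:.2f} semitones")
```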
@article {pmid34642073, year = {2021}, author = {Tarai, SK and Chatterjee, I and Pani, S}, title = {A Comparative Acoustic Analysis of Bangla Folk Song and RabindraSangeet on Long-Term Average Spectrum.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2021.08.014}, pmid = {34642073}, issn = {1873-4588}, abstract = {BACKGROUND: Singing is a sensory-motor phenomenon that requires a particular balance of physical skills involving respiration, phonation, resonance, and articulation. The long-term average spectrum (LTAS) is widely accepted as a robust and effective tool for the assessment of voice characteristics.
METHOD: Eighty female singers within the age range of 18-30 years were considered for the study. Of the 80 participants, 40 were asked to perform one traditional song from Bangla Folk representing the Baul style, and the other 40 were asked to perform a traditional song from Rabindra Sangeet. Recordings were made, and acoustic (LTAS) analyses were performed with the PRAAT software. Statistical analyses of the extracted data were carried out using the Statistical Package for the Social Sciences (Version 20.0).
RESULTS: The averaged LTAS curve of the Baul style showed a broad peak in the frequency range between 2000 and 3600 Hz with an amplitude of about 16 dB, whereas Rabindra Sangeet showed a broader peak in the frequency range between 2200 and 3800 Hz with an amplitude of about 15 dB. This evidence showed the presence of the singer's formant in both singing styles.
CONCLUSION: It can be concluded from the present study that there is an acoustical difference between the Bangla Folk and Rabindra Sangeet singing styles, which can be evidenced using LTAS in PRAAT.}, }
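An LTAS of the kind used in this entry can be computed through Praat from Python via the parselmouth bindings; the sketch below assumes a hypothetical input file, uses a 100 Hz analysis bandwidth, and queries the peak in the singer's-formant region.

```python
import parselmouth
from parselmouth.praat import call

snd = parselmouth.Sound("baul_song.wav")  # hypothetical recording
ltas = call(snd, "To Ltas", 100)          # 100 Hz analysis bandwidth

# Peak level and location in the 2000-3600 Hz singer's-formant region
peak_db = call(ltas, "Get maximum", 2000, 3600, "None")
peak_hz = call(ltas, "Get frequency of maximum", 2000, 3600, "None")
print(f"LTAS peak: {peak_db:.1f} dB at {peak_hz:.0f} Hz")
```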
@article {pmid34642071, year = {2021}, author = {Lee, Y and Park, HJ and Bae, IH and Kim, G}, title = {Resonance Characteristics in Epiglottic Cyst: Formant Frequency, Vowel Space Area, Vowel Articulatory Index, and Formant Centralization Ratio.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2021.09.008}, pmid = {34642071}, issn = {1873-4588}, abstract = {OBJECTIVES: Resonance characteristics can change due to alterations in the shape of the vocal tract in patients with epiglottic cysts. This study aimed to analyze the resonance characteristics before and after the surgical excision of epiglottic cysts.
METHODS: Twelve male patients with epiglottic cysts were enrolled in this study. We analyzed the first and second formants (F1 and F2) in vowels /a/, /e/, /i/, /o/, and /u/, vowel space area (VSA), vowel articulatory index (VAI), and formant centralization ratio (FCR). We measured these parameters before and after the surgical excision of epiglottic cysts.
RESULTS: There was a significant increase in the F1 values of /a/, VSA, and VAI, and a significant decrease in the value of FCR after the surgery.
CONCLUSION: We confirmed the change in resonance characteristics in patients with epiglottic cysts. Further studies on epiglottic cysts and resonance changes are needed.}, }
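The three derived measures have standard formulations in the literature (with FCR the reciprocal of VAI, as in Sapir and colleagues' work); below is a sketch of their computation from mean formant values, using hypothetical Hz values rather than the study's data.

```python
import numpy as np

# Hypothetical mean formants per vowel: {vowel: (F1, F2)} in Hz
v = {'a': (750, 1300), 'e': (450, 1900), 'i': (300, 2250),
     'o': (450, 850),  'u': (320, 800)}

def polygon_area(points):
    # Shoelace formula; points ordered around the perimeter of the vowel polygon
    x, y = np.array(points, dtype=float).T
    return 0.5 * abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))

# Vowel space area over the /i e a o u/ polygon (F2 as x, F1 as y), in Hz^2
vsa = polygon_area([(v[k][1], v[k][0]) for k in ('i', 'e', 'a', 'o', 'u')])

f1i, f2i = v['i']; f1u, f2u = v['u']; f1a, f2a = v['a']
vai = (f2i + f1a) / (f1i + f1u + f2u + f2a)  # vowel articulatory index
fcr = 1 / vai                                # formant centralization ratio

print(f"VSA = {vsa:,.0f} Hz^2, VAI = {vai:.2f}, FCR = {fcr:.2f}")
```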
@article {pmid34641989, year = {2021}, author = {König, A and Mallick, E and Tröger, J and Linz, N and Zeghari, R and Manera, V and Robert, P}, title = {Measuring neuropsychiatric symptoms in patients with early cognitive decline using speech analysis.}, journal = {European psychiatry : the journal of the Association of European Psychiatrists}, volume = {64}, number = {1}, pages = {e64}, pmid = {34641989}, issn = {1778-3585}, mesh = {Aged ; Anxiety/diagnosis ; *Apathy ; *Cognitive Dysfunction/diagnosis ; Female ; Humans ; Machine Learning ; Male ; Neuropsychological Tests ; Speech ; }, abstract = {BACKGROUND: Certain neuropsychiatric symptoms (NPS), namely apathy, depression, and anxiety, have demonstrated great value in predicting dementia progression, eventually representing a window of opportunity for timely diagnosis and treatment. However, sensitive and objective markers of these symptoms are still missing. Therefore, the present study aims to investigate the association between automatically extracted speech features and NPS in patients with mild neurocognitive disorders.
METHODS: Speech of 141 patients aged 65 or older with neurocognitive disorder was recorded while they performed two short narrative speech tasks. NPS were assessed with the neuropsychiatric inventory. Paralinguistic markers relating to prosodic, formant, source, and temporal qualities of speech were automatically extracted and correlated with the NPS. Machine learning experiments were carried out to validate the diagnostic power of the extracted markers.
RESULTS: Different speech variables are associated with specific NPS: apathy correlates with temporal aspects, and anxiety with voice quality; this was mostly consistent between males and females after correction for cognitive impairment. Machine learning regressors are able to extract information from the speech features and perform above baseline in predicting anxiety, apathy, and depression scores.
CONCLUSIONS: Different NPS seem to be characterized by distinct speech features, which are easily extractable automatically from short vocal tasks. These findings support the use of speech analysis for detecting subtypes of NPS in patients with cognitive impairment. This could have great implications for the design of future clinical trials as this cost-effective method could allow more continuous and even remote monitoring of symptoms.}, }
@article {pmid34632373, year = {2021}, author = {Coto-Solano, R and Stanford, JN and Reddy, SK}, title = {Advances in Completely Automated Vowel Analysis for Sociophonetics: Using End-to-End Speech Recognition Systems With DARLA.}, journal = {Frontiers in artificial intelligence}, volume = {4}, number = {}, pages = {662097}, pmid = {34632373}, issn = {2624-8212}, abstract = {In recent decades, computational approaches to sociophonetic vowel analysis have been steadily increasing, and sociolinguists now frequently use semi-automated systems for phonetic alignment and vowel formant extraction, including FAVE (Forced Alignment and Vowel Extraction, Rosenfelder et al., 2011; Evanini et al., Proceedings of Interspeech, 2009), Penn Aligner (Yuan and Liberman, J. Acoust. Soc. America, 2008, 123, 3878), and DARLA (Dartmouth Linguistic Automation), (Reddy and Stanford, DARLA Dartmouth Linguistic Automation: Online Tools for Linguistic Research, 2015a). Yet these systems still have a major bottleneck: manual transcription. For most modern sociolinguistic vowel alignment and formant extraction, researchers must first create manual transcriptions. This human step is painstaking, time-consuming, and resource intensive. If this manual step could be replaced with completely automated methods, sociolinguists could potentially tap into vast datasets that have previously been unexplored, including legacy recordings that are underutilized due to lack of transcriptions. Moreover, if sociolinguists could quickly and accurately extract phonetic information from the millions of hours of new audio content posted on the Internet every day, a virtual ocean of speech from newly created podcasts, videos, live-streams, and other audio content would now inform research. How close are the current technological tools to achieving such groundbreaking changes for sociolinguistics? Prior work (Reddy et al., Proceedings of the North American Association for Computational Linguistics 2015 Conference, 2015b, 71-75) showed that an HMM-based Automated Speech Recognition system, trained with CMU Sphinx (Lamere et al., 2003), was accurate enough for DARLA to uncover evidence of the US Southern Vowel Shift without any human transcription. Even so, because that automatic speech recognition (ASR) system relied on a small training set, it produced numerous transcription errors. Six years have passed since that study, and since that time numerous end-to-end automatic speech recognition (ASR) algorithms have shown considerable improvement in transcription quality. One example of such a system is the RNN/CTC-based DeepSpeech from Mozilla (Hannun et al., 2014). (RNN stands for recurrent neural networks, the learning mechanism for DeepSpeech. CTC stands for connectionist temporal classification, the mechanism to merge phones into words). The present paper combines DeepSpeech with DARLA to push the technological envelope and determine how well contemporary ASR systems can perform in completely automated vowel analyses with sociolinguistic goals. Specifically, we used these techniques on audio recordings from 352 North American English speakers in the International Dialects of English Archive (IDEA), extracting 88,500 tokens of vowels in stressed position from spontaneous, free speech passages. With this large dataset we conducted acoustic sociophonetic analyses of the Southern Vowel Shift and the Northern Cities Chain Shift in the North American IDEA speakers. 
We compared the results using three different sources of transcriptions: 1) IDEA's manual transcriptions as the baseline "ground truth", 2) the ASR built on CMU Sphinx used by Reddy et al. (Proceedings of the North American Association for Computational Linguistics 2015 Conference, 2015b, 71-75), and 3) the latest publicly available Mozilla DeepSpeech system. We input these three different transcriptions to DARLA, which automatically aligned and extracted the vowel formants from the 352 IDEA speakers. Our quantitative results show that newer ASR systems like DeepSpeech show considerable promise for sociolinguistic applications like DARLA. We found that DeepSpeech's automated transcriptions had significantly lower character error rates than those from the prior Sphinx system (from 46% to 35%). When we performed the sociolinguistic analysis of the extracted vowel formants from DARLA, we found that the automated transcriptions from DeepSpeech matched the results from the ground truth for the Southern Vowel Shift (SVS): five vowels showed a shift in both transcriptions, and two vowels didn't show a shift in either transcription. The Northern Cities Shift (NCS) was more difficult to detect, but ground truth and DeepSpeech matched for four vowels: one of the vowels showed a clear shift, and three showed no shift in either transcription. Our study therefore shows how technology has made progress toward greater automation in vowel sociophonetics, while also showing what remains to be done. Our statistical modeling provides a quantified view of both the abilities and the limitations of a completely "hands-free" analysis of vowel shifts in a large dataset. Naturally, when comparing a completely automated system against a semi-automated system involving human manual work, there will always be a tradeoff between accuracy on the one hand versus speed and replicability on the other hand [Kendall and Joseph, Towards best practices in sociophonetics (with Marianna DiPaolo), 2014]. The amount of "noise" that can be tolerated for a given study will depend on the particular research goals and researchers' preferences. Nonetheless, our study shows that, for certain large-scale applications and research goals, a completely automated approach using publicly available ASR can produce meaningful sociolinguistic results across large datasets, and these results can be generated quickly, efficiently, and with full replicability.}, }
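The character error rate used to compare transcription systems here is simply the Levenshtein edit distance between hypothesis and reference, normalized by reference length; a self-contained sketch:

```python
def cer(ref: str, hyp: str) -> float:
    """Character error rate: Levenshtein edits divided by reference length."""
    m, n = len(ref), len(hyp)
    d = list(range(n + 1))  # one rolling row of the edit-distance table
    for i in range(1, m + 1):
        prev, d[0] = d[0], i
        for j in range(1, n + 1):
            cur = d[j]
            d[j] = min(d[j] + 1,                            # deletion
                       d[j - 1] + 1,                        # insertion
                       prev + (ref[i - 1] != hyp[j - 1]))   # substitution
            prev = cur
    return d[n] / m

print(cer("she sells sea shells", "she sell the shells"))
```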
@article {pmid34632133, year = {2021}, author = {Sondhi, S and Salhan, A and Santoso, CA and Doucoure, M and Dharmawan, DM and Sureka, A and Natasha, BN and Danusaputro, AD and Dowson, NS and Yap, MSL and Hadiwidjaja, MA and Veeraraghavan, SG and Hatta, AZR and Lee, C and Megantara, RA and Wihardja, AN and Sharma, M and Lardizabal, EL and Sondhi, LJ and Raina, R and Vashisth, S and Hedwig, R}, title = {Voice processing for COVID-19 scanning and prognostic indicator.}, journal = {Heliyon}, volume = {7}, number = {10}, pages = {e08134}, pmid = {34632133}, issn = {2405-8440}, abstract = {The COVID-19 pandemic has posed a serious risk of contagion to humans, and there is a need for reliable non-contact tests such as vocal correlates of COVID-19 infection. Thirty-six volunteers of Asian ethnicity, 16 infected subjects (8M & 8F) and 20 non-infected controls (10M & 10F), participated in this study by vocalizing the vowels /a/, /e/, /i/, /o/, /u/. Voice correlates of the 16 COVID-19-positive patients, during infection and after recovery, were compared with those of the 20 non-infected controls. Compared to non-infected controls, significantly higher values of energy intensity for /o/ (p = 0.048); formant F1 for /o/ (p = 0.014); and formant F3 for /u/ (p = 0.032) were observed in male patients, while higher values of Jitter (local, abs) for /o/ (p = 0.021) and Jitter (ppq5) for /a/ (p = 0.014) were observed in female patients. However, formant F2 for /u/ (p = 0.018) and mean pitch F0 for /e/, /i/ and /o/ (p = 0.033; 0.036; 0.047) decreased for female patients under infection. Compared to recovered conditions, HNR for /e/ (p = 0.014) was higher in male patients under infection, while Jitter (rap) for /a/ (p = 0.041); Jitter (ppq5) for /a/ (p = 0.032); Shimmer (local, dB) for /i/ (p = 0.024); Shimmer (apq5) for /u/ (p = 0.019); and formant F4 for vowel /o/ (p = 0.022) were higher in female patients under infection. However, HNR for /e/ (p = 0.041) and formant F1 for /o/ (p = 0.002) were lower in female patients compared to their recovered conditions. The obtained results support the hypothesis, since changes in voice parameters were observed in the infected patients that can be correlated to a combination of acoustic measures like fundamental frequency, formant characteristics, HNR, and voice perturbations like jitter and shimmer for different vowels. Thus, voice analysis can be used for scanning and prognosis of COVID-19 infection. Based on the findings of this study, a mobile application can be developed to analyze the human voice in real time to detect COVID-19 symptoms for remedial measures and necessary action.}, }
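The perturbation measures reported in this entry (jitter variants, shimmer variants, HNR, mean pitch) are standard Praat quantities; the sketch below extracts them via the parselmouth Python bindings, with a hypothetical sustained-vowel file and commonly used parameter values.

```python
import parselmouth
from parselmouth.praat import call

snd = parselmouth.Sound("vowel_a.wav")  # hypothetical sustained /a/
pp = call(snd, "To PointProcess (periodic, cc)", 75, 500)

jitter_local = call(pp, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
jitter_ppq5 = call(pp, "Get jitter (ppq5)", 0, 0, 0.0001, 0.02, 1.3)
shimmer_db = call([snd, pp], "Get shimmer (local_dB)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

harm = call(snd, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
hnr = call(harm, "Get mean", 0, 0)
f0_mean = call(snd.to_pitch(), "Get mean", 0, 0, "Hertz")

print(f0_mean, jitter_local, jitter_ppq5, shimmer_db, hnr)
```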
@article {pmid34550454, year = {2022}, author = {Gama, R and Castro, ME and van Lith-Bijl, JT and Desuter, G}, title = {Does the wearing of masks change voice and speech parameters?.}, journal = {European archives of oto-rhino-laryngology : official journal of the European Federation of Oto-Rhino-Laryngological Societies (EUFOS) : affiliated with the German Society for Oto-Rhino-Laryngology - Head and Neck Surgery}, volume = {279}, number = {4}, pages = {1701-1708}, pmid = {34550454}, issn = {1434-4726}, mesh = {Acoustics ; Humans ; Phonation ; Speech ; Speech Acoustics ; *Voice ; *Voice Disorders/etiology/prevention & control ; Voice Quality ; }, abstract = {PURPOSE: The authors aim to review available reports on the potential effects of masks on voice and speech parameters.
METHODS: A literature search was conducted using MEDLINE and Google Scholar databases through July 2021. Several targeted populations, mask scenarios and methodologies were approached. The assessed voice parameters were divided into self-reported, acoustic and aerodynamic.
RESULTS: The wearing of a face mask was observed to induce several changes in voice parameters: (1) self-reported-significantly increased vocal effort and fatigue, increased vocal tract discomfort, and increased values of the voice handicap index (VHI); (2) acoustic-increased voice intensity, altered formant frequencies (F2 and F3) with no changes in fundamental frequency, increased harmonics-to-noise ratio (HNR), and increased mean spectral values at high-frequency levels (1000-8000 Hz), especially with the KN95 mask; (3) aerodynamic-maximum phonation time was assessed in only two reports and showed no alterations.
CONCLUSION: Despite the different populations, mask-type scenarios, and methodologies described by each study, the results of this review outline significant changes in voice characteristics with the use of face masks. Wearing a mask has been shown to increase the perception of vocal effort and to alter vocal tract length and speech articulatory movements, leading to spectral sound changes and impaired communication and perception. Studies analyzing the effect of masks on voice aerodynamics are lacking. Further research is required to study the long-term effects of face masks on the potential development of voice pathology.}, }
@article {pmid34543515, year = {2021}, author = {Wang, Y and Qiu, X and Wang, F and Li, Y and Guo, H and Nie, L}, title = {Single-crystal ordered macroporous metal-organic framework as support for molecularly imprinted polymers and their integration in membrane formant for the specific recognition of zearalenone.}, journal = {Journal of separation science}, volume = {44}, number = {22}, pages = {4190-4199}, doi = {10.1002/jssc.202100393}, pmid = {34543515}, issn = {1615-9314}, mesh = {Chromatography, High Pressure Liquid/methods ; Edible Grain/*chemistry ; Extraction and Processing Industry/methods ; Food Contamination/analysis ; Metal-Organic Frameworks ; Molecular Imprinting/methods ; Molecularly Imprinted Polymers ; Mycotoxins/analysis/chemistry ; Solid Phase Extraction/methods ; Zearalenone/*analysis/chemistry ; }, abstract = {Zearalenone is a fungal contaminant that is widely present in grains. Here, a novel molecularly imprinted membrane based on SOM-ZIF-8 was developed for the rapid and highly selective identification of zearalenone in grain samples. The molecularly imprinted membrane was prepared using polyvinylidene fluoride, cyclododecyl 2,4-dihydroxybenzoate as a template, and SOM-ZIF-8 as a carrier. The factors influencing the extraction of zearalenone using this membrane, including the solution pH, extraction time, elution solvent, elution time, and elution volume, were studied in detail. The optimized conditions were 5 mL of sample solution at pH 6, extraction time of 45 min, 4 mL of acetonitrile:methanol = 9:1 as elution solvent, and elution time of 20 min. The method displayed a good linear range of 12-120 ng/g (R² = 0.998), with limits of detection and quantification of 1.7 and 5.5 ng/g, respectively. In addition, the membrane was used to selectively identify zearalenone in grain samples with percent recoveries ranging from 87.9 to 101.0% and relative standard deviations of less than 6.6%. Overall, this study presents a simple and effective chromatographic pretreatment method for detecting zearalenone in food samples.}, }
@article {pmid34538710, year = {2022}, author = {Erdur, OE and Yilmaz, BS}, title = {Voice changes after surgically assisted rapid maxillary expansion.}, journal = {American journal of orthodontics and dentofacial orthopedics : official publication of the American Association of Orthodontists, its constituent societies, and the American Board of Orthodontics}, volume = {161}, number = {1}, pages = {125-132}, doi = {10.1016/j.ajodo.2020.06.055}, pmid = {34538710}, issn = {1097-6752}, mesh = {Acoustics ; Adult ; Humans ; Maxilla ; *Palatal Expansion Technique ; *Voice Quality ; }, abstract = {INTRODUCTION: This study aimed to investigate voice changes in patients who had surgically assisted rapid maxillary expansion (SARME).
METHODS: Nineteen adult patients with maxillary transverse deficiency were asked to pronounce the sounds "[a], [ϵ], [ɯ], [i], [ɔ], [œ], [u], [y]" for 3 seconds. Voice records were taken before the expansion appliance was placed (T0) and 5.8 weeks after its removal (T1, after 5.2 months of retention). The same records were taken for the control group (n = 19). The fundamental frequency (F0) and formant frequencies (F1, F2, and F3), together with the shimmer, jitter, and noise-to-harmonics ratio (NHR) parameters, were analyzed with Praat (version 6.0.43).
RESULTS: In the SARME group, significant differences were observed in the F1 of the [a] (P = 0.005), F2 of the [ϵ] (P = 0.008), and [œ] sounds (P = 0.004); the postexpansion values were lower than those recorded before. In contrast, the F1 of the [y] sound (P = 0.02), the F2 of the [u] sound (P = 0.01), the jitter parameter of the [ɯ] and [i] sounds (P = 0.04; P = 0.002), and the NHR value of the [ϵ] sound (P = 0.04) were significantly higher than the baseline values. In the comparison with the control group, significant differences were found in the F0 (P = 0.025) and F1 (P = 0.046) of the [u] sound, the F1 of the [a] sound (P = 0.03), and the F2 of the [ϵ] sound (P = 0.037). Significant differences were also found in the shimmer of [i] (P = 0.017) and [ɔ] (P = 0.002), the jitter of [ϵ] (P = 0.046) and [i] (P = 0.017), and the NHR of [i] (P = 0.012) and [ɔ] (P = 0.009).
CONCLUSION: SARME led to significant differences in some of the acoustics parameters.}, }
@article {pmid34498908, year = {2022}, author = {Perlman, M and Paul, J and Lupyan, G}, title = {Vocal communication of magnitude across language, age, and auditory experience.}, journal = {Journal of experimental psychology. General}, volume = {151}, number = {4}, pages = {885-896}, doi = {10.1037/xge0001103}, pmid = {34498908}, issn = {1939-2222}, mesh = {Adolescent ; Animals ; China ; Culture ; Humans ; *Language ; *Voice ; }, abstract = {Like many other vocalizing vertebrates, humans convey information about their body size through the sound of their voice. Vocalizations of larger animals are typically longer in duration, louder in intensity, and lower in frequency. We investigated people's ability to use voice-size correspondences to communicate about the magnitude of external referents. First, we asked hearing children, as well as deaf children and adolescents, living in China to improvise nonlinguistic vocalizations to distinguish between paired items contrasting in magnitude (e.g., a long vs. short string, a big vs. small ball). Then we played these vocalizations back to adult listeners in the United States and China to assess their ability to correctly guess the intended referents. We find that hearing and deaf producers both signaled greater magnitude items with longer and louder vocalizations and with smaller formant spacing. Only hearing producers systematically used fundamental frequency, communicating greater magnitude with higher fo. The vocalizations of both groups were understandable to Chinese and American listeners, although accuracy was higher with vocalizations from older producers. American listeners relied on the same acoustic properties as Chinese listeners: both groups interpreted vocalizations with longer duration and greater intensity as referring to greater items; neither American nor Chinese listeners consistently used fo or formant spacing as a cue. These findings show that the human ability to use vocalizations to communicate about the magnitude of external referents is highly robust, extending across listeners of disparate linguistic and cultural backgrounds, as well as across age and auditory experience. (PsycInfo Database Record (c) 2022 APA, all rights reserved).}, }
@article {pmid34482728, year = {2021}, author = {Stansbury, AL and Janik, VM}, title = {The role of vocal learning in call acquisition of wild grey seal pups.}, journal = {Philosophical transactions of the Royal Society of London. Series B, Biological sciences}, volume = {376}, number = {1836}, pages = {20200251}, pmid = {34482728}, issn = {1471-2970}, mesh = {Animals ; Female ; *Learning ; Male ; *Seals, Earless ; *Vocalization, Animal ; }, abstract = {Pinnipeds have been identified as one of the best available models for the study of vocal learning. Experimental evidence for their learning skills is demonstrated with advanced copying skills, particularly in formant structure when copying human speech sounds and melodies. By contrast, almost no data are available on how learning skills are used in their own communication systems. We investigated the impact of playing modified seal sounds in a breeding colony of grey seals (Halichoerus grypus) to study how acoustic input influenced vocal development of eight pups. Sequences of two or three seal pup calls were edited so that the average peak frequency between calls in a sequence changed up or down. We found that seals copied the specific stimuli played to them and that copies became more accurate over time. The differential response of different groups showed that vocal production learning was used to achieve conformity, suggesting that geographical variation in seal calls can be caused by horizontal cultural transmission. While learning of pup calls appears to have few benefits, we suggest that it also affects the development of the adult repertoire, which may facilitate social interactions such as mate choice. This article is part of the theme issue 'Vocal learning in animals and humans'.}, }
@article {pmid34474938, year = {2021}, author = {Güths, RC and Rolim, MRP and Coelho, A}, title = {Glottal Voice Distortions: Nasolaryngoscopic and Spectral Analysis of Anatomophysiologic Changes in Singing Voice.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2021.07.018}, pmid = {34474938}, issn = {1873-4588}, abstract = {Distorted voices, commonly called vocal drives in Brazil and some other South American countries, are vocal ornaments belonging to the aesthetics of popular singing and desired by singers of different styles. Advances in the vocal sciences over the last four decades have allowed the demystification of this type of technique, classifying such distortions/drives as glottal, supraglottic, or mixed. An interdisciplinary approach to the evaluation of singers who use glottal distortions is fundamental for a broad understanding of the particularities of each case. The main objective of the present study is to describe the anatomophysiological and spectral findings of the glottal distortions identified in the practice of many singers. Samples were collected from three singers producing sung emissions with and without vocal distortions. A PreSonus® AudioBox Studio One kit was used to record the voice during the nasolaryngoscopic evaluation. The singers underwent vocal warm-up and functional evaluation of the larynx based on two studies of contemporary singers. The singers performed the Snarl Voice and Phaser distortions, and both showed particular anatomophysiological behaviors. The larynx was low in the first distortion and at the level of the clean voice in the second; posterior opening of the glottis was observed in both distortions, with opening of the middle third of the glottis in the first as well. Formants varied according to the vocal tract settings used for the distortions. The glottal distortions present a complex anatomophysiological behavior in their composition, with fundamental participation of the transverse interarytenoid and lateral cricoarytenoid muscles, as well as participation of the vocal folds in the frequency break. F3 varied with the longitudinal length of the vocal tract and F4 with its diameter, both being related to the three-dimensional adjustments of the vocal tract.}, }
@article {pmid34470280, year = {2021}, author = {Stehr, DA and Hickok, G and Ferguson, SH and Grossman, ED}, title = {Examining vocal attractiveness through articulatory working space.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {2}, pages = {1548}, doi = {10.1121/10.0005730}, pmid = {34470280}, issn = {1520-8524}, mesh = {Acoustics ; Female ; Humans ; Language ; Male ; Phonetics ; Speech ; *Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {Robust gender differences exist in the acoustic correlates of clearly articulated speech, with females, on average, producing speech that is acoustically and phonetically more distinct than that of males. This study investigates the relationship between several acoustic correlates of clear speech and subjective ratings of vocal attractiveness. Talkers were recorded producing vowels in /bVd/ context and sentences containing the four corner vowels. Multiple measures of working vowel space were computed from continuously sampled formant trajectories and were combined with measures of speech timing known to co-vary with clear articulation. Partial least squares regression (PLS-R) modeling was used to predict ratings of vocal attractiveness for male and female talkers based on the acoustic measures. PLS components that loaded on size and shape measures of working vowel space-including the quadrilateral vowel space area, convex hull area, and bivariate spread of formants-along with measures of speech timing were highly successful at predicting attractiveness in female talkers producing /bVd/ words. These findings are consistent with a number of hypotheses regarding human attractiveness judgments, including the role of sexual dimorphism in mate selection, the significance of traits signalling underlying health, and perceptual fluency accounts of preferences.}, }
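The analysis pipeline in this entry pairs working-vowel-space measures with PLS-R; the sketch below computes a convex hull area with scipy and fits scikit-learn's PLSRegression on random placeholder data, standing in for the talker-level measures and attractiveness ratings.

```python
import numpy as np
from scipy.spatial import ConvexHull
from sklearn.cross_decomposition import PLSRegression

rng = np.random.default_rng(1)

# Working vowel space: hull of continuously sampled (F2, F1) points for one talker
pts = rng.normal(loc=[1500, 500], scale=[300, 80], size=(200, 2))  # placeholder track
hull_area = ConvexHull(pts).volume  # for 2-D points, .volume is the enclosed area

# Talker-level predictors (hull area, quadrilateral VSA, formant spread, timing, ...)
X = rng.normal(size=(30, 6))   # 30 talkers x 6 acoustic measures (placeholders)
y = rng.normal(size=30)        # mean attractiveness rating per talker (placeholder)

pls = PLSRegression(n_components=2).fit(X, y)
print("hull area (Hz^2):", round(hull_area))
print("training R^2:", pls.score(X, y))
print("component-1 loadings:", pls.x_loadings_[:, 0].round(2))
```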
@article {pmid34470262, year = {2021}, author = {Sahoo, S and Dandapat, S}, title = {Analyzing the vocal tract characteristics for out-of-breath speech.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {2}, pages = {1524}, doi = {10.1121/10.0005945}, pmid = {34470262}, issn = {1520-8524}, mesh = {Female ; Humans ; Male ; Sound Spectrography ; Speech ; *Speech Acoustics ; *Voice ; }, abstract = {In this work, vocal tract characteristic changes under the out-of-breath condition are explored. Speech produced under the influence of physical exercise is called out-of-breath speech. The change in breathing pattern results in perceptual changes in the produced sound. For the vocal tract, the first four formants show a lowering of their average frequencies. The bandwidths BF1 and BF2 widen, whereas the other two narrow; the changes in the latter are small. For a given speaker, the changes in frequency and bandwidth may not be uniform across formants. Subband analysis is carried out around the formants to compare the variation of the vocal tract with that of the source. A vocal tract adaptive empirical wavelet transform is used to extract formant-specific subbands from the speech and the source. A support vector machine performs binary classification between normal and out-of-breath speech on these subbands. For all speakers, the source subbands yield an F1-score improvement of 4% over the speech subbands. Similarly, a performance improvement of 5% can be seen for both male and female speakers. Furthermore, misclassification is lower for the source than for speech. These results suggest that physical exercise influences the source more than the vocal tract.}, }
@article {pmid34470045, year = {2022}, author = {Dastolfo-Hromack, C and Bush, A and Chrabaszcz, A and Alhourani, A and Lipski, W and Wang, D and Crammond, DJ and Shaiman, S and Dickey, MW and Holt, LL and Turner, RS and Fiez, JA and Richardson, RM}, title = {Articulatory Gain Predicts Motor Cortex and Subthalamic Nucleus Activity During Speech.}, journal = {Cerebral cortex (New York, N.Y. : 1991)}, volume = {32}, number = {7}, pages = {1337-1349}, pmid = {34470045}, issn = {1460-2199}, support = {U01 NS098969/NS/NINDS NIH HHS/United States ; }, mesh = {*Deep Brain Stimulation ; Humans ; *Motor Cortex/physiology ; *Parkinson Disease/therapy ; Speech ; *Subthalamic Nucleus/physiology ; }, abstract = {Speaking precisely is important for effective verbal communication, and articulatory gain is one component of speech motor control that contributes to achieving this goal. Given that the basal ganglia have been proposed to regulate the speed and size of limb movement, that is, movement gain, we explored the basal ganglia contribution to articulatory gain, through local field potentials (LFP) recorded simultaneously from the subthalamic nucleus (STN), precentral gyrus, and postcentral gyrus. During STN deep brain stimulation implantation for Parkinson's disease, participants read aloud consonant-vowel-consonant syllables. Articulatory gain was indirectly assessed using the F2 Ratio, an acoustic measurement of the second formant frequency of /i/ vowels divided by that of /u/ vowels. Mixed effects models demonstrated that the F2 Ratio correlated with alpha and theta activity in the precentral gyrus and STN. No correlations were observed for the postcentral gyrus. Functional connectivity analysis revealed that higher phase locking values for beta activity between the STN and precentral gyrus were correlated with lower F2 Ratios, suggesting that higher beta synchrony impairs articulatory precision. Effects were not related to disease severity. These data suggest that articulatory gain is encoded within the basal ganglia-cortical loop.}, }
@article {pmid34400103, year = {2023}, author = {Aires, MM and de Vasconcelos, D and Lucena, JA and Gomes, AOC and Moraes, BT}, title = {Effect of Wendler glottoplasty on voice and quality of life of transgender women.}, journal = {Brazilian journal of otorhinolaryngology}, volume = {89}, number = {1}, pages = {22-29}, pmid = {34400103}, issn = {1808-8686}, mesh = {Male ; Humans ; Female ; Adult ; *Transgender Persons ; Quality of Life ; Prospective Studies ; Treatment Outcome ; Speech Acoustics ; }, abstract = {OBJECTIVE: To investigate the effect of Wendler glottoplasty on voice feminization, voice quality and voice-related quality of life.
METHODS: Prospective interventional cohort of transgender women submitted to Wendler glottoplasty. Acoustic analysis of the voice included assessment of fundamental frequency, maximum phonation time, formant frequencies (F1 and F2), frequency range, jitter, and shimmer. Voice quality was blindly assessed using the GRBAS scale. Voice-related quality of life was measured using the Trans Woman Voice Questionnaire and self-perceived femininity of the voice.
RESULTS: A total of 7 patients were included. The mean age was 35.4 years, and the mean postoperative follow-up time was 13.7 months. There was a mean increase of 47.9 ± 46.6 Hz (p = 0.023) in sustained /e/ F0 and a mean increase of 24.6 ± 27.5 Hz (p = 0.029) in speaking F0 after glottoplasty. There was no statistical significance in the pre- and postoperative comparison of maximum phonation time, formant frequencies, frequency range, jitter, shimmer, and the grade, roughness, breathiness, asthenia, and strain scale. Trans Woman Voice Questionnaire scores decreased following surgery from 98.3 ± 9.2 to 54.1 ± 25.0 (p = 0.007), and mean self-perceived femininity of the voice increased from 2.8 ± 1.8 to 7.7 ± 2.4 (p = 0.008). One patient (14%) presented a postoperative granuloma, and there was one (14%) premature suture dehiscence.
CONCLUSION: Glottoplasty is safe and effective for feminizing the voice of transgender women. There was an increase in fundamental frequency, without aggravating other acoustic parameters or voice quality. Voice-related quality of life improved after surgery.}, }
@article {pmid34396801, year = {2022}, author = {Chung, H}, title = {Acoustic Characteristics of Pre- and Post-vocalic /l/: Patterns from One Southern White Vernacular English.}, journal = {Language and speech}, volume = {65}, number = {2}, pages = {513-528}, doi = {10.1177/00238309211037368}, pmid = {34396801}, issn = {1756-6053}, mesh = {Acoustics ; Adult ; Female ; Humans ; Language ; Male ; *Phonetics ; *Speech Acoustics ; }, abstract = {This study examined acoustic characteristics of the phoneme /l/ produced by young female and male adult speakers of Southern White Vernacular English (SWVE) from Louisiana. F1, F2, and F2-F1 values extracted at the /l/ midpoint were analyzed by word position (pre- vs. post-vocalic) and vowel contexts (/i, ɪ/ vs. /ɔ, a/). Descriptive analysis showed that SWVE /l/ exhibited characteristics of the dark /l/ variant. The formant patterns of /l/, however, differed significantly by word position and vowel context, with pre-vocalic /l/ showing significantly higher F2-F1 values than post-vocalic /l/, and /l/ in the high front vowel context showing significantly higher F2-F1 values than those in the low back vowel context. Individual variation in the effects of word position and vowel contexts on /l/ pattern was also observed. Overall, the findings of the current study showed a gradient nature of SWVE /l/ variants whose F2-F1 patterns generally fell into the range of the dark /l/ variant, while varying by word position and vowel context.}, }
@article {pmid34388438, year = {2021}, author = {Yang, L and Fu, K and Zhang, J and Shinozaki, T}, title = {Non-native acoustic modeling for mispronunciation verification based on language adversarial representation learning.}, journal = {Neural networks : the official journal of the International Neural Network Society}, volume = {142}, number = {}, pages = {597-607}, doi = {10.1016/j.neunet.2021.07.017}, pmid = {34388438}, issn = {1879-2782}, mesh = {Acoustics ; Humans ; *Language ; Language Development ; Speech ; *Speech Perception ; }, abstract = {Non-native mispronunciation verification is designed to provide feedback that guides language learners to correct pronunciation errors in their further learning, and it plays an important role in computer-aided pronunciation training (CAPT) systems. Most existing approaches establish the acoustic model directly on a non-native corpus and thus suffer from data sparsity, because collecting and annotating non-native speech is time-consuming. In this work, to address this problem, we propose a pre-training approach that utilizes the speech data of two native languages (the learner's native and target languages) for non-native mispronunciation verification. We set up an unsupervised model to extract knowledge from a large amount of unlabeled raw speech of the target language by making predictions about future observations in the speech signal; the model is then trained with language adversarial training using the learner's native language, aligning the feature distributions of the two languages by confusing a language discriminator. In addition, a sinc filter is incorporated at the first convolutional layer to capture formant-like features. Formants are relevant to the place and manner of articulation, so they are useful not only for pronunciation error detection but also for providing instructive feedback. The pre-trained model then serves as the feature extractor in the downstream mispronunciation verification task. Experiments on the Japanese part of the BLCU inter-Chinese speech corpus demonstrate that, for the non-native phone recognition and mispronunciation verification tasks, (1) the knowledge learned from the speech of the two native languages with the proposed unsupervised approach is useful for both tasks, (2) the proposed language adversarial representation learning is effective in improving performance, and (3) formant-like features can be incorporated by introducing a sinc filter to further improve mispronunciation verification.}, }
@article {pmid34384662, year = {2021}, author = {Leyns, C and Corthals, P and Cosyns, M and Papeleu, T and Van Borsel, J and Morsomme, D and T'Sjoen, G and D'haeseleer, E}, title = {Acoustic and Perceptual Effects of Articulation Exercises in Transgender Women.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2021.06.033}, pmid = {34384662}, issn = {1873-4588}, abstract = {PURPOSE: This study measured the impact of articulation exercises using a cork and articulation exercises for lip spreading on the formant frequencies of vowels and listener perceptions of femininity in transgender women.
METHODS: Thirteen transgender women were recorded before and after the cork exercise and before and after the lip spreading exercise. Speech samples included continuous speech during reading and were analyzed using Praat software. Vowel formant frequencies (F1, F2, F3, F4, F5) and vowel space were determined. A listening experiment was organized using naïve cisgender women and cisgender men rating audio samples of continuous speech. Masculinity/femininity, vocal quality and age were rated, using a visual analogue scale (VAS).
RESULTS: Concerning vowel formant frequencies, F2 /a/ and F5 /u/ significantly increased after the lip spreading exercise, as well as F3 /a/, F3 /u/ and F4 /a/ after the cork exercise. The lip spreading exercise had more impact on the F2 /a/ than the cork exercise. Vowel space did not change after the exercises. The fundamental frequency (fo) increased simultaneously during both exercises. Both articulation exercises were associated with significantly increased listener perceptions of femininity of the voice.
CONCLUSION: Subtle changes in formant frequencies can be observed after performing articulation exercises, but not in every formant frequency or vowel. Cisgender listeners rated the speech of the transgender women more feminine after the exercises. Further research with a more extensive therapy program and listening experiment is needed to examine these preliminary findings.}, }
@article {pmid34344099, year = {2021}, author = {Yang, JJ and Cheng, LY and Xu, W}, title = {[Study on changes of voice characteristics after adenotonsillectomy or adenoidectomy in children].}, journal = {Zhonghua er bi yan hou tou jing wai ke za zhi = Chinese journal of otorhinolaryngology head and neck surgery}, volume = {56}, number = {7}, pages = {724-729}, doi = {10.3760/cma.j.cn115330-20200813-00672}, pmid = {34344099}, issn = {1673-0860}, mesh = {Adenoidectomy ; *Adenoids/surgery ; Child ; Child, Preschool ; Female ; Humans ; Male ; Speech Acoustics ; *Tonsillectomy ; Voice Quality ; }, abstract = {Objective: To study voice changes in children after adenotonsillectomy or adenoidectomy and their relationship with vocal tract structure. Methods: Fifty patients aged 4 to 12 years (median age 6) were recruited prospectively. They underwent adenotonsillectomy or adenoidectomy in Beijing Tongren Hospital, Capital Medical University, from July 2019 to August 2020. The cases comprised 31 males and 19 females; 36 patients underwent adenotonsillectomy and 14 underwent adenoidectomy alone. Twenty-two children (13 males, 9 females) with degree Ⅰ bilateral tonsils, without adenoid hypertrophy and with no snoring, were selected as normal controls. Adenoid and tonsil sizes were evaluated, and subjective changes of voice were recorded after surgery. Moreover, voice data including fundamental frequency (F0), jitter, shimmer, noise-to-harmonic ratio (NHR), maximum phonation time (MPT), formant frequencies (F1-F5), and bandwidths (B1-B5) of the vowels /a/ and /i/ were analyzed before surgery, and 3 days and 1 month after surgery. SPSS 23.0 was used for statistical analysis. Results: Thirty-six patients (72.0%, 36/50) complained of postoperative voice changes. The incidence was inversely correlated with age: in children aged 4-6, 7-9, and 10-12, it was 83.3% (25/30), 63.6% (7/11), and 44.4% (4/9), respectively. Voice changes appeared more common in children who underwent adenotonsillectomy (77.8%, 28/36) than in those who underwent adenoidectomy alone (57.1%, 8/14), but the difference was not statistically significant. After the operation, for the vowel /a/, MPT (Z=2.18, P=0.041) and F2 (t=2.13, P=0.040) increased, while B2 (Z=2.04, P=0.041) and B4 (Z=2.00, P=0.046) decreased. For the vowel /i/, F2 (t=2.035, P=0.050) and F4 (t=4.44, P=0.0001) increased, and B2 (Z=2.36, P=0.019) decreased. Other acoustic parameters were not significantly different from those before surgery. The F2 (r=-0.392, P=0.032) of the vowel /a/ and the F2 (r=-0.279, P=0.048) and F4 (r=-0.401, P=0.028) of the vowel /i/ after adenotonsillectomy were significantly higher than those after adenoidectomy alone. Half of the patients with postoperative voice changes recovered spontaneously 1 month after surgery. Conclusions: Voice changes in children who underwent adenotonsillectomy or adenoidectomy might be related to changes in their formants and bandwidths. The effect of adenotonsillectomy on voice was more significant than that of adenoidectomy alone. Apart from MPT, the acoustic parameters did not change significantly after surgery.}, }
@article {pmid34342877, year = {2021}, author = {Frey, R and Wyman, MT and Johnston, M and Schofield, M and Locatelli, Y and Reby, D}, title = {Roars, groans and moans: Anatomical correlates of vocal diversity in polygynous deer.}, journal = {Journal of anatomy}, volume = {239}, number = {6}, pages = {1336-1369}, pmid = {34342877}, issn = {1469-7580}, mesh = {Acoustics ; Animals ; *Deer ; Female ; *Larynx ; Male ; Vocal Cords ; Vocalization, Animal ; }, abstract = {Eurasian deer are characterized by the extraordinary diversity of their vocal repertoires. Male sexual calls range from roars with relatively low fundamental frequency (hereafter fo) in red deer Cervus elaphus, to moans with extremely high fo in sika deer Cervus nippon, and almost infrasonic groans with exceptionally low fo in fallow deer Dama dama. Moreover, while both red and fallow males are capable of lowering their formant frequencies during their calls, sika males appear to lack this ability. Female contact calls are also characterized by relatively less pronounced, yet strong interspecific differences. The aim of this study is to examine the anatomical bases of these inter-specific and inter-sexual differences by identifying if the acoustic variation is reflected in corresponding anatomical variation. To do this, we investigated the vocal anatomy of male and female specimens of each of these three species. Across species and sexes, we find that the observed acoustic variability is indeed related to expected corresponding anatomical differences, based on the source-filter theory of vocal production. At the source level, low fo is associated with larger vocal folds, whereas high fo is associated with smaller vocal folds: sika deer have the smallest vocal folds and male fallow deer the largest. Red and sika deer vocal folds do not appear to be sexually dimorphic, while fallow deer exhibit strong sexual dimorphism (after correcting for body size differences). At the filter level, the variability in formants is related to the configuration of the vocal tract: in fallow and red deer, both sexes have evolved a permanently descended larynx (with a resting position of the larynx much lower in males than in females). Both sexes also have the potential for momentary, call-synchronous vocal tract elongation, again more pronounced in males than in females. In contrast, the resting position of the larynx is high in both sexes of sika deer and the potential for further active vocal tract elongation is virtually absent in both sexes. Anatomical evidence suggests an evolutionary reversal in larynx position within sika deer, that is, a secondary larynx ascent. Together, our observations confirm that the observed diversity of vocal behaviour in polygynous deer is supported by strong anatomical differences, highlighting the importance of anatomical specializations in shaping mammalian vocal repertoires. Sexual selection is discussed as a potential evolutionary driver of the observed vocal diversity and sexual dimorphisms.}, }
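The source-filter account invoked in the entry above has a compact quantitative core: for a uniform tube closed at the glottis and open at the lips, formants fall at odd multiples of c/4L, so any call-synchronous vocal tract elongation lowers all formants together. A minimal sketch (the tube lengths are hypothetical, not measurements from the article):

```python
# Resonances of a uniform tube closed at the glottis and open at the lips:
#   F_n = (2n - 1) * c / (4 * L)
# Lengthening L lowers every formant, which is the source-filter rationale
# for laryngeal descent and vocal tract elongation in calling deer stags.

C = 350.0  # approximate speed of sound in warm, moist air (m/s)

def tube_formants(length_m, n_formants=4):
    """Return the first few formant frequencies (Hz) of a closed-open tube."""
    return [(2 * n - 1) * C / (4 * length_m) for n in range(1, n_formants + 1)]

print(tube_formants(0.40))  # hypothetical resting length: ~[219, 656, 1094, 1531] Hz
print(tube_formants(0.55))  # hypothetical elongated length: ~[159, 477, 795, 1114] Hz
```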
@article {pmid34340503, year = {2021}, author = {Strycharczuk, P and Ćavar, M and Coretta, S}, title = {Distance vs time. Acoustic and articulatory consequences of reduced vowel duration in Polish.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {1}, pages = {592}, doi = {10.1121/10.0005585}, pmid = {34340503}, issn = {1520-8524}, mesh = {Acoustics ; Language ; *Phonetics ; Poland ; Speech ; *Speech Acoustics ; }, abstract = {This paper presents acoustic and articulatory (ultrasound) data on vowel reduction in Polish. The analysis focuses on the question of whether the change in formant value in unstressed vowels can be explained by duration-driven undershoot alone or whether there is also evidence for additional stress-specific articulatory mechanisms that systematically affect vowel formants. On top of the expected durational differences between the stressed and unstressed conditions, the duration is manipulated by inducing changes in the speech rate. The observed vowel formants are compared to expected formants derived from the articulatory midsagittal tongue data in different conditions. The results show that the acoustic vowel space is reduced in size and raised in unstressed vowels compared to stressed vowels. Most of the spectral reduction can be explained by reduced vowel duration, but there is also an additional systematic effect of F1-lowering in unstressed non-high vowels that does not follow from tongue movement. The proposed interpretation is that spectral vowel reduction in Polish behaves largely as predicted by the undershoot model of vowel reduction, but the effect of undershoot is enhanced for low unstressed vowels, potentially by a stress marking strategy which involves raising the fundamental frequency.}, }
@article {pmid34340486, year = {2021}, author = {Petersen, EA and Colinot, T and Silva, F and H-Turcotte, V}, title = {The bassoon tonehole lattice: Links between the open and closed holes and the radiated sound spectrum.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {1}, pages = {398}, doi = {10.1121/10.0005627}, pmid = {34340486}, issn = {1520-8524}, abstract = {The acoustics of the bassoon has been the subject of relatively few studies compared with other woodwind instruments. One reason for this may lie in its complicated resonator geometry, which includes irregularly spaced toneholes with chimney heights ranging from 3 to 31 mm. The current article evaluates the effect of the open and closed tonehole lattice (THL) on the acoustic response of the bassoon resonator. It is shown that this response can be divided into three distinct frequency bands that are determined by the open and closed THL: below 500 Hz, 500-2200 Hz, and above 2200 Hz. The first is caused by the stopband of the open THL, where the low frequency effective length of the instrument is determined by the location of the first open tonehole. The second is due to the passband of the open THL, such that the modes are proportional to the total length of the resonator. The third is due to the closed THL, where part of the acoustical power is trapped within the resonator. It is proposed that these three frequency bands impact the radiated spectrum by introducing a formant in the vicinity of 500 Hz and suppressing radiation above 2200 Hz for most first register fingerings.}, }
@article {pmid34340472, year = {2021}, author = {Uezu, Y and Hiroya, S and Mochida, T}, title = {Articulatory compensation for low-pass filtered formant-altered auditory feedback.}, journal = {The Journal of the Acoustical Society of America}, volume = {150}, number = {1}, pages = {64}, doi = {10.1121/10.0004775}, pmid = {34340472}, issn = {1520-8524}, mesh = {Feedback ; Feedback, Sensory ; Speech ; Speech Acoustics ; *Speech Perception ; Speech Production Measurement ; }, abstract = {Auditory feedback while speaking plays an important role in stably controlling speech articulation. Its importance has been verified in formant-altered auditory feedback (AAF) experiments where speakers utter while listening to speech with perturbed first (F1) and second (F2) formant frequencies. However, the contribution of the frequency components higher than F2 to the articulatory control under the perturbations of F1 and F2 has not yet been investigated. In this study, a formant-AAF experiment was conducted in which a low-pass filter was applied to speech. The experimental results showed that the deviation in the compensatory response was significantly larger when a low-pass filter with a cutoff frequency of 3 kHz was used compared to that when cutoff frequencies of 4 and 8 kHz were used. It was also found that the deviation in the 3-kHz condition correlated with the fundamental frequency and spectral tilt of the produced speech. Additional simulation results using a neurocomputational model of speech production (SimpleDIVA model) and the experimental data showed that the feedforward learning rate increased as the cutoff frequency decreased. These results suggest that high-frequency components of the auditory feedback would be involved in the determination of corrective motor commands from auditory errors.}, }
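The entry above gives the cutoff frequencies (3, 4, and 8 kHz) but not the filter design; the sketch below shows one plausible way to generate such low-pass feedback conditions offline. The Butterworth type, filter order, and placeholder waveform are assumptions, not details from the study:

```python
import numpy as np
from scipy.signal import butter, sosfiltfilt

def lowpass(signal, fs, cutoff_hz, order=8):
    """Zero-phase Butterworth low-pass filter."""
    sos = butter(order, cutoff_hz, btype="low", fs=fs, output="sos")
    return sosfiltfilt(sos, signal)

fs = 44100
speech = np.random.randn(fs)  # placeholder for 1 s of the speaker's feedback signal
conditions = {c: lowpass(speech, fs, c) for c in (3000, 4000, 8000)}  # the three cutoffs
```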
@article {pmid34291230, year = {2021}, author = {Lynn, E and Narayanan, SS and Lammert, AC}, title = {Dark tone quality and vocal tract shaping in soprano song production: Insights from real-time MRI.}, journal = {JASA express letters}, volume = {1}, number = {7}, pages = {075202}, pmid = {34291230}, issn = {2691-1191}, abstract = {Tone quality termed "dark" is an aesthetically important property of Western classical voice performance and has been associated with lowered formant frequencies, lowered larynx, and widened pharynx. The present study uses real-time magnetic resonance imaging with synchronous audio recordings to investigate dark tone quality in four professionally trained sopranos with enhanced ecological validity and a relatively complete view of the vocal tract. Findings differ from traditional accounts, indicating that labial narrowing may be the primary driver of dark tone quality across performers, while many other aspects of vocal tract shaping are shown to differ significantly in a performer-specific way.}, }
@article {pmid34265989, year = {2021}, author = {Liu, R and Wang, G and Deng, D and Zhang, T}, title = {Spin Hall effect of Laguerre-Gaussian beams in PT symmetric metamaterials.}, journal = {Optics express}, volume = {29}, number = {14}, pages = {22192-22201}, doi = {10.1364/OE.427869}, pmid = {34265989}, issn = {1094-4087}, abstract = {The spin Hall effect (SHE) of Laguerre-Gaussian (LG) beams reflected and transmitted in parity-time (PT) symmetric metamaterials is investigated near the coherent-perfect-absorption (CPA)-laser point and exceptional points (EPs). The numerical results show that large transverse shifts occur at the CPA-laser point regardless of the incident direction. At the EPs, however, the SHE increases on one side and disappears on the other, thus achieving an intense SHE of the reflected beams for incidence on the specified side. In addition, it is found that Bragg oscillation can be generated by increasing the number of periods of the PT symmetric metamaterial layers, thus increasing the number of formants in the transverse displacement. In particular, the transverse shift peaks of the transmitted beams merge into a single positive peak when the incident angle approaches 90[∘] and remain essentially unchanged as Im(ɛ) increases, which can be regarded as strong tolerance to variation in Im(ɛ). This feature is expected to enable a new type of optoelectronic device with anti-interference performance. These results provide a feasible path for modulating the spin Hall effect of light (SHEL) and open possibilities for the development of new nanophotonic devices.}, }
@article {pmid34261582, year = {2021}, author = {Joshi, A and Procter, T and Kulesz, PA}, title = {COVID-19: Acoustic Measures of Voice in Individuals Wearing Different Facemasks.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, pmid = {34261582}, issn = {1873-4588}, abstract = {AIM: The global health pandemic caused by the SARS-coronavirus 2 (COVID-19) has led to the adoption of facemasks as a necessary safety precaution. Depending on the level of risk for exposure to the virus, the facemasks that are used can vary. The aim of this study was to examine the effect of different types of facemasks, typically used by healthcare professionals and the public during the COVID-19 pandemic, on measures of voice.
METHODS: Nineteen adults (ten females, nine males) with normal voice quality completed sustained vowel tasks. All tasks were performed in each of six mask conditions: no mask, cloth mask, surgical mask, KN95 mask, and a surgical mask over a KN95 mask with and without a face shield. Intensity measurements were obtained at 1 ft and 6 ft distances from the speaker with sound level meters. Tasks were recorded with a 1 ft mouth-to-microphone distance. Acoustic variables of interest were fundamental frequency (F0) and formant frequencies (F1, F2) for /a/ and /i/, and smoothed cepstral peak prominence (CPPs) for /a/.
RESULTS: Data were analyzed to compare differences between sexes and mask types. There were statistically significant differences between males and females for intensity measures and all acoustic variables except F2 for /a/ and F1 for /i/. Few pairwise comparisons between masks reached significance, even though main effects of mask type were observed. These are discussed further in the article.
CONCLUSION: The masks tested in this study did not have a significant impact on intensity, fundamental frequency, CPPs, or the first or second formant frequencies compared to voice output without a mask. Use of a face shield seemed to affect intensity and CPPs to some extent. Implications of these findings are discussed further in the article.}, }
@article {pmid34260437, year = {2022}, author = {Easwar, V and Birstler, J and Harrison, A and Scollie, S and Purcell, D}, title = {The Influence of Sensation Level on Speech-Evoked Envelope Following Responses.}, journal = {Ear and hearing}, volume = {43}, number = {1}, pages = {250-254}, pmid = {34260437}, issn = {1538-4667}, support = {UL1 TR002373/TR/NCATS NIH HHS/United States ; //CIHR/Canada ; }, mesh = {Acoustic Stimulation ; Adult ; Female ; Humans ; Male ; Phonetics ; Sensation ; *Speech ; *Speech Perception/physiology ; }, abstract = {OBJECTIVES: To evaluate sensation level (SL)-dependent characteristics of envelope following responses (EFRs) elicited by band-limited speech dominant in low, mid, and high frequencies.
DESIGN: In 21 young normal-hearing adults, EFRs were elicited by eight male-spoken speech stimuli: the first formant, and the second and higher formants, of /u/, /a/ and /i/, and the modulated fricatives /∫/ and /s/. Stimulus SL was computed from behaviorally measured thresholds.
RESULTS: At 30 dB SL, the amplitude and phase coherence of fricative-elicited EFRs were ~1.5 to 2 times higher than all vowel-elicited EFRs, whereas fewer and smaller differences were found among vowel-elicited EFRs. For all stimuli, EFR amplitude and phase coherence increased by roughly 50% for every 10 dB increase in SL between ~0 and 50 dB.
CONCLUSIONS: Stimulus- and frequency-dependent differences in EFRs persist even after accounting for differences in the audibility of speech sounds. The growth rate of EFR characteristics with SL is independent of the stimulus and its frequency.}, }
@article {pmid34256982, year = {2021}, author = {Zealouk, O and Satori, H and Hamidi, M and Laaidi, N and Salek, A and Satori, K}, title = {Analysis of COVID-19 Resulting Cough Using Formants and Automatic Speech Recognition System.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, pmid = {34256982}, issn = {1873-4588}, abstract = {As part of our contribution to research on the ongoing COVID-19 pandemic, we studied changes in the coughs of infected people based on Hidden Markov Model (HMM) speech recognition classification, formant frequency analysis, and pitch analysis. In this paper, an HMM-based cough recognition system was implemented with 5 HMM states, 8 Gaussian mixture distributions (GMMs), and the 13 basic Mel-frequency cepstral coefficients (MFCCs) within a 39-dimensional overall feature vector. Formant frequencies and pitch values extracted from the coughs of COVID-19-infected and healthy people were compared to corroborate the recognition results. The experimental results show that the difference between the recognition rates for infected and non-infected people is 6.7%. The formant variation between the coughs of infected and non-infected people is clearly observed for F1, F3, and F4, and is smaller for F0 and F2.}, }
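The 39-dimensional feature vector described in this entry is the conventional one: 13 MFCCs plus their first (delta) and second (delta-delta) time derivatives. A sketch of that feature extraction step (the waveform is a random placeholder for a cough recording, and the HMM/GMM training itself is omitted):

```python
import numpy as np
import librosa

sr = 16000
y = np.random.randn(sr).astype(np.float32)          # placeholder cough waveform (1 s)

mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)  # (13, n_frames) static coefficients
d1 = librosa.feature.delta(mfcc)                    # first derivatives (delta)
d2 = librosa.feature.delta(mfcc, order=2)           # second derivatives (delta-delta)

features = np.vstack([mfcc, d1, d2])                # (39, n_frames) overall feature vector
print(features.shape)
```

Frames of this matrix would then be modeled by the 5-state HMM with 8-component GMM emissions described in the abstract.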
@article {pmid34251887, year = {2021}, author = {Easwar, V and Scollie, S and Lasarev, M and Urichuk, M and Aiken, SJ and Purcell, DW}, title = {Characteristics of Speech-Evoked Envelope Following Responses in Infancy.}, journal = {Trends in hearing}, volume = {25}, number = {}, pages = {23312165211004331}, pmid = {34251887}, issn = {2331-2165}, support = {UL1 TR002373/TR/NCATS NIH HHS/United States ; }, mesh = {Acoustic Stimulation ; Hearing Tests ; Humans ; Infant ; Phonetics ; *Speech ; *Speech Perception ; Young Adult ; }, abstract = {Envelope following responses (EFRs) may be a useful tool for evaluating the audibility of speech sounds in infants. The present study aimed to evaluate the characteristics of speech-evoked EFRs in infants with normal hearing, relative to adults, and identify age-dependent changes in EFR characteristics during infancy. In 42 infants and 21 young adults, EFRs were elicited by the first (F1) and the second and higher formants (F2+) of the vowels /u/, /a/, and /i/, dominant in low and mid frequencies, respectively, and by amplitude-modulated fricatives /s/ and /∫/, dominant in high frequencies. In a subset of 20 infants, the in-ear stimulus level was adjusted to match that of an average adult ear (65 dB sound pressure level [SPL]). We found that (a) adult-infant differences in EFR amplitude, signal-to-noise ratio, and intertrial phase coherence were larger and spread across the frequency range when in-ear stimulus level was adjusted in infants, (b) adult-infant differences in EFR characteristics were the largest for low-frequency stimuli, (c) infants demonstrated adult-like phase coherence when they received a higher (i.e., unadjusted) stimulus level, and (d) EFR phase coherence and signal-to-noise ratio changed with age in the first year of life for a few F2+ vowel stimuli in a level-specific manner. Together, our findings reveal that development-related changes in EFRs during infancy likely vary by stimulus frequency, with low-frequency stimuli demonstrating the largest adult-infant differences. Consistent with previous research, our findings emphasize the significant role of stimulus level calibration methods while investigating developmental trends in EFRs.}, }
@article {pmid34241428, year = {2021}, author = {Echternach, M and Herbst, CT and Köberlein, M and Story, B and Döllinger, M and Gellrich, D}, title = {Are source-filter interactions detectable in classical singing during vowel glides?.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {6}, pages = {4565}, doi = {10.1121/10.0005432}, pmid = {34241428}, issn = {1520-8524}, mesh = {Female ; Humans ; Male ; Occupations ; Phonation ; *Singing ; *Voice ; Voice Quality ; }, abstract = {In recent studies, it has been assumed that vocal tract formants (Fn) and the voice source could interact. However, only a few studies have tested this assumption in vivo. Here, the vowel transition /i/-/a/-/u/-/i/ of 12 professional classical singers (6 females, 6 males) phonating on the pitch D4 [fundamental frequency (ƒo) ca. 294 Hz] was analyzed using transnasal high-speed videoendoscopy (20,000 fps), electroglottography (EGG), and audio recordings. Fn data were calculated using a cepstral method. Source-filter interaction candidates (SFICs) were determined by (a) algorithmic detection of major intersections of Fn/nƒo and (b) perceptual assessment of the EGG signal. Although the open quotient showed some increase for the /i-a/ and /u-i/ transitions, there were no clear effects at the expected Fn/nƒo intersections. In contrast, ƒo adjustments and changes in the phonovibrogram occurred at perceptually derived SFICs, suggesting level-two interactions. In some cases, these were constituted by intersections between higher nƒo and Fn. The presented data partially corroborate that vowel transitions may result in level-two interactions also in professional singers. However, the lack of systematically detectable effects suggests either the absence of a strong interaction or the existence of confounding factors, which may potentially counterbalance the level-two interactions.}, }
@article {pmid34241427, year = {2021}, author = {Zhang, C and Jepson, K and Lohfink, G and Arvaniti, A}, title = {Comparing acoustic analyses of speech data collected remotely.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {6}, pages = {3910}, pmid = {34241427}, issn = {1520-8524}, mesh = {Acoustics ; *COVID-19 ; Humans ; Phonetics ; SARS-CoV-2 ; *Speech ; Speech Acoustics ; }, abstract = {Face-to-face speech data collection has been next to impossible globally as a result of the COVID-19 restrictions. To address this problem, simultaneous recordings of three repetitions of the cardinal vowels were made using a Zoom H6 Handy Recorder with an external microphone (henceforth, H6) and compared with two alternatives accessible to potential participants at home: the Zoom meeting application (henceforth, Zoom) and two lossless mobile phone applications (Awesome Voice Recorder, and Recorder; henceforth, Phone). F0 was tracked accurately by all of the devices; however, for formant analysis (F1, F2, F3), Phone performed better than Zoom, i.e., more similarly to H6, although the data extraction method (VoiceSauce, Praat) also resulted in differences. In addition, Zoom recordings exhibited unexpected drops in intensity. The results suggest that lossless format phone recordings present a viable option for at least some phonetic studies.}, }
@article {pmid34240071, year = {2021}, author = {Diamant, N and Amir, O}, title = {Examining the voice of Israeli transgender women: Acoustic measures, voice femininity and voice-related quality-of-life.}, journal = {International journal of transgender health}, volume = {22}, number = {3}, pages = {281-293}, pmid = {34240071}, issn = {2689-5277}, abstract = {BACKGROUND: Transgender women may experience gender-dysphoria associated with their voice and the way it is perceived. Previous studies have shown that specific acoustic measures are associated with the perception of voice-femininity and with voice-related quality-of-life, yet results are inconsistent.
AIMS: This study aimed to examine the associations between specific voice measures of transgender women, voice-related quality-of-life, and the perception of voice-femininity by listeners and by the speakers themselves.
METHODS: Thirty Hebrew-speaking transgender women were recorded. They also rated their voice-femininity and completed the Hebrew version of the TVQ[MtF] questionnaire. Recordings were analyzed to extract mean fundamental frequency (F0), formant frequencies (F1, F2, F3), and vocal-range (calculated in Hz and in semitones). Recordings were also rated on a 7-point voice-gender scale by 20 naïve cisgender listeners.
RESULTS: Significant correlations were found between both F0 and F1 and listeners' as well as speakers' evaluation of voice-femininity. TVQ[MtF] scores were significantly correlated with F0 and with the lower and upper boundaries of the vocal-range. Voice-femininity ratings were strongly correlated with vocal-range, when calculated in Hz, but not when defined in semitones. Listeners' evaluation and speakers' self-evaluation of voice-femininity were significantly correlated. However, TVQ[MtF] scores were significantly correlated only with the speakers' voice-femininity ratings, but not with those of the listeners.
CONCLUSION: Higher F0 and F1, which are perceived as more feminine, jointly improved speakers' satisfaction with their voice. Speakers' self-evaluation of voice-femininity does not mirror listeners' judgment, as it is affected by additional factors related to self-satisfaction and personal experience. Combining listeners' and speakers' voice evaluations with acoustic analysis is valuable, providing a more holistic view of how transgender women feel about their voice and how it is perceived by listeners.}, }
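The Hz-versus-semitone discrepancy reported in this entry follows from the logarithmic definition of the semitone scale: the same semitone span covers a wider Hz span at higher absolute pitch, so a range expressed in Hz co-varies with pitch height while the same range in semitones does not. A worked example:

```python
import math

def range_semitones(f_low, f_high):
    """Vocal range in semitones: 12 * log2(f_high / f_low)."""
    return 12 * math.log2(f_high / f_low)

# Two hypothetical speakers, each with a one-octave range:
print(range_semitones(100, 200))  # 12.0 semitones, spanning 100 Hz
print(range_semitones(200, 400))  # 12.0 semitones, spanning 200 Hz
```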
@article {pmid34232704, year = {2021}, author = {Leung, Y and Oates, J and Chan, SP and Papp, V}, title = {Associations Between Speaking Fundamental Frequency, Vowel Formant Frequencies, and Listener Perceptions of Speaker Gender and Vocal Femininity-Masculinity.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {7}, pages = {2600-2622}, doi = {10.1044/2021_JSLHR-20-00747}, pmid = {34232704}, issn = {1558-9102}, mesh = {Australia ; Female ; *Femininity ; Humans ; Male ; *Masculinity ; Perception ; Speech Acoustics ; }, abstract = {Purpose The aim of the study was to examine associations between speaking fundamental frequency (fos), vowel formant frequencies (F), listener perceptions of speaker gender, and vocal femininity-masculinity. Method An exploratory study was undertaken to examine associations between fos, F1-F3, listener perceptions of speaker gender (nominal scale), and vocal femininity-masculinity (visual analog scale). For 379 speakers of Australian English aged 18-60 years, fos mode and F1-F3 (12 monophthongs; total of 36 Fs) were analyzed on a standard reading passage. Seventeen listeners rated speaker gender and vocal femininity-masculinity on randomized audio recordings of these speakers. Results Model building using principal component analysis suggested the 36 Fs could be succinctly reduced to seven principal components (PCs). Generalized structural equation modeling (with the seven PCs of F and fos as predictors) suggested that only F2 and fos predicted listener perceptions of speaker gender (male, female, unable to decide). However, listener perceptions of vocal femininity-masculinity behaved differently and were predicted by F1, F3, and the contrast between monophthongs at the extremities of the F1 acoustic vowel space, in addition to F2 and fos. Furthermore, listeners' perceptions of speaker gender also influenced ratings of vocal femininity-masculinity substantially. Conclusion Adjusted odds ratios highlighted the substantially larger contribution of F to listener perceptions of speaker gender and vocal femininity-masculinity relative to fos than has previously been reported.}, }
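A sketch of the dimension-reduction step this entry describes, collapsing 36 formant measures (F1-F3 for 12 monophthongs) into seven principal components. The data below are random placeholders, not the study's measurements:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.normal(size=(379, 36))   # 379 speakers x 36 formant measures (placeholder)

pca = PCA(n_components=7)        # the study retained seven PCs
scores = pca.fit_transform(X)    # (379, 7) component scores per speaker
print(pca.explained_variance_ratio_.round(3))
```

The seven PC scores, together with fos, would then serve as predictors in the generalized structural equation model.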
@article {pmid34229221, year = {2021}, author = {Easwar, V and Boothalingam, S and Flaherty, R}, title = {Fundamental frequency-dependent changes in vowel-evoked envelope following responses.}, journal = {Hearing research}, volume = {408}, number = {}, pages = {108297}, doi = {10.1016/j.heares.2021.108297}, pmid = {34229221}, issn = {1878-5891}, mesh = {Acoustic Stimulation ; Hearing Tests ; Humans ; Male ; Speech ; *Speech Perception ; *Voice ; Young Adult ; }, abstract = {Scalp-recorded envelope following responses (EFRs) provide a non-invasive method to assess the encoding of the fundamental frequency (f0) of voice that is important for speech understanding. It is well-known that EFRs are influenced by voice f0. However, this effect of f0 has not been examined independent of concomitant changes in spectra or neural generators. We evaluated the effect of voice f0 on EFRs while controlling for vowel formant characteristics and potentially avoiding significant changes in dominant neural generators using a small f0 range. EFRs were elicited by a male-spoken vowel /u/ (average f0 = 100.4 Hz) and its lowered f0 version (average f0 = 91.9 Hz) with closely matched formant characteristics. Vowels were presented to each ear of 17 young adults with normal hearing. EFRs were simultaneously recorded between the vertex and the nape, and the vertex and the ipsilateral mastoid-the two most common electrode montages used for EFRs. Our results indicate that when vowel formant characteristics are matched, an increase in f0 by 8.5 Hz reduces EFR amplitude by 25 nV, phase coherence by 0.05 and signal-to-noise ratio by 3.5 dB, on average. The reduction in EFR characteristics was similar across ears of stimulation and the two montages used. These findings will help parse the influence of f0 or stimulus spectra on EFRs when both co-vary.}, }
@article {pmid34213387, year = {2022}, author = {Eravci, FC and Yildiz, BD and Özcan, KM and Moran, M and Çolak, M and Karakurt, SE and Karakuş, MF and Ikinciogullari, A}, title = {Acoustic parameter changes after bariatric surgery.}, journal = {Logopedics, phoniatrics, vocology}, volume = {47}, number = {4}, pages = {256-261}, doi = {10.1080/14015439.2021.1945676}, pmid = {34213387}, issn = {1651-2022}, mesh = {Humans ; Adult ; Middle Aged ; *Speech Acoustics ; Voice Quality ; Prospective Studies ; Longitudinal Studies ; Acoustics ; *Bariatric Surgery/adverse effects ; Weight Loss ; }, abstract = {OBJECTIVE: To investigate the acoustic parameter changes after weight loss in bariatric surgery patients.
MATERIALS AND METHODS: This prospective, longitudinal study was conducted with 15 patients with planned bariatric surgery, who were evaluated pre-operatively and at 6 months post-operatively. Fundamental frequency (F0), Formant frequency (F1, F2, F3, and F4), Frequency perturbation (Jitter), Amplitude perturbation (Shimmer) and Noise-to-Harmonics Ratio (NHR) parameters were evaluated for /a/, /e/, /i/, /o/, and /u/ vowels. Changes in the acoustic analysis parameters for each vowel were compared. The study group was separated into two groups according to whether the Mallampati score had not changed (Group 1) or had decreased (Group 2) and changes in the formant frequencies were compared between these groups.
RESULTS: A total of 15 patients with a median age of 40 ± 11 years completed the study. The median weight of the patients was 122 ± 14 kg pre-operatively and 80 ± 15 kg post-operatively. BMI declined from 46 ± 4 to 31 ± 5 kg/m[2]. The Mallampati score decreased by one point in six patients and remained stable in nine. Among the acoustic voice analysis parameters of the vowels, fundamental frequency generally tended to decrease, while shimmer and jitter values tended to increase. Some of the formant frequencies were specifically affected by the weight loss, with statistically significant differences between Group 1 and Group 2.
CONCLUSION: The present study reveals that some specific voice characteristics might be affected by successful weight loss after bariatric surgery. Highlights: Obesity reduces the size of the pharyngeal lumen at different levels. The supralaryngeal vocal tract size and configuration is a determinative factor in the features of the voice. Changes in the length and shape of the vocal tract, or in the height and position of the tongue, can result in changes in acoustic analysis, especially in formant frequencies.}, }
@article {pmid34160929, year = {2021}, author = {Yang, J}, title = {Vowel development in young Mandarin-English bilingual children.}, journal = {Phonetica}, volume = {78}, number = {3}, pages = {241-272}, doi = {10.1515/phon-2021-2006}, pmid = {34160929}, issn = {1423-0321}, mesh = {Child ; Child, Preschool ; Humans ; Language ; Language Development ; *Multilingualism ; Phonetics ; *Speech Perception ; }, abstract = {This study examined the development of vowel categories in young Mandarin-English bilingual children. The participants included 35 children aged between 3 and 4 years old (15 Mandarin-English bilinguals, six English monolinguals, and 14 Mandarin monolinguals). The bilingual children were divided into two groups: one group had a shorter duration (<1 year) of intensive immersion in English (Bi-low group) and one group had a longer duration (>1 year) of intensive immersion in English (Bi-high group). The participants were recorded producing one list of Mandarin words containing the vowels /a, i, u, y, ɤ/ and/or one list of English words containing the vowels /i, ɪ, e, ɛ, æ, u, ʊ, o, ɑ, ʌ/. Formant frequency values were extracted at five equidistant time points (20%, 35%, 50%, 65%, and 80%) over the course of vowel duration. Cross-language and within-language comparisons were conducted on the midpoint formant values and formant trajectories. The results showed that children in the Bi-low group produced their English vowels in overlapping clusters and showed positional deviations from the monolingual targets. However, they maintained the phonetic features of their native vowel sounds well and mainly used an assimilatory process to organize the vowel systems. Children in the Bi-high group separated their English vowels well. They used both assimilatory and dissimilatory processes to construct and refine the two vowel systems. These bilingual children approximated monolingual English children more closely than the children in the Bi-low group. However, compared to their monolingual peers, they demonstrated observable deviations in both L1 and L2.}, }
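A sketch of sampling formant trajectories at the five equidistant time points used in this study, via the Praat wrapper parselmouth. The waveform and vowel boundaries are placeholders; a real analysis would load a recorded word and segment the vowel first:

```python
import numpy as np
import parselmouth

# Placeholder signal standing in for a recorded vowel (0.5 s at 22.05 kHz):
snd = parselmouth.Sound(np.random.randn(11025), sampling_frequency=22050)
formant = snd.to_formant_burg(max_number_of_formants=5)

v_start, v_end = 0.05, 0.45               # hypothetical vowel interval (s)
for p in (0.20, 0.35, 0.50, 0.65, 0.80):  # the five equidistant points
    t = v_start + p * (v_end - v_start)
    f1 = formant.get_value_at_time(1, t)
    f2 = formant.get_value_at_time(2, t)
    print(f"{int(p * 100)}%: F1={f1:.0f} Hz, F2={f2:.0f} Hz")
```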
@article {pmid34116888, year = {2021}, author = {Lin, Y and Cheng, L and Wang, Q and Xu, W}, title = {Effects of Medical Masks on Voice Assessment During the COVID-19 Pandemic.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2021.04.028}, pmid = {34116888}, issn = {1873-4588}, abstract = {OBJECTIVE: Voice assessment is of great significance to the evaluation of voice quality. Our study aims to explore the effects of medical masks on healthy people in acoustic, aerodynamic and formant parameters during the COVID-19 pandemic. In addition, we also attempted to verify the differences between different sexes and ages.
METHODS: Fifty-three healthy participants (25 males and 28 females) were involved in our study. The acoustic parameters, including fundamental frequency (F0), sound pressure level (SPL), percentage of jitter (%), percentage of shimmer (%), noise to harmonic ratio (NHR) and cepstral peak prominence (CPP), aerodynamic parameter (maximum phonation time, MPT) and formant parameters (formant frequency, F1, F2, F3) without and with wearing medical masks were included. We further investigated the potential differences in the impact on different sexes and ages (≤45 years old and >45 years old).
RESULTS: While wearing medical masks, the SPL significantly increased (71.22±4.25 dB vs. 72.42±3.96 dB, P = 0.021). Jitter and shimmer significantly decreased (jitter 1.19±0.83 vs. 0.87±0.67, P = 0.005; shimmer 4.49±2.20 vs. 3.66±2.02, P = 0.002), as did F3 (2855±323.34 Hz vs. 2781.89±353.42 Hz, P = 0.004). F0, MPT, F1 and F2 showed increasing trends without statistical significance, and NHR as well as CPP changed little without and with wearing medical masks. There were no significant differences between males and females. Regarding age, a significant difference in MPT was seen (>45-year-old group 16.15±6.98 s vs. 15.38±7.02 s; ≤45-year-old group 20.26±6.47 s vs. 21.44±6.98 s, P = 0.032).
CONCLUSION: Healthy participants showed a significantly higher SPL, smaller perturbation, and an evident decrease in F3 after wearing medical masks. These changes may result from adjustment of the vocal tract and the filtration function of medical masks, leading the stability of the recorded voices to be overstated. The impact of medical masks did not differ evidently between the sexes, while MPT was influenced more in the >45-year-old group than in the ≤45-year-old group.}, }
@article {pmid34091212, year = {2021}, author = {Madrid, AM and Walker, KA and Smith, SB and Hood, LJ and Prieve, BA}, title = {Relationships between click auditory brainstem response and speech frequency following response with development in infants born preterm.}, journal = {Hearing research}, volume = {407}, number = {}, pages = {108277}, doi = {10.1016/j.heares.2021.108277}, pmid = {34091212}, issn = {1878-5891}, support = {R01 DC011777/DC/NIDCD NIH HHS/United States ; }, mesh = {Child, Preschool ; *Evoked Potentials, Auditory, Brain Stem ; Gestational Age ; Humans ; Infant ; Infant, Newborn ; Infant, Premature ; Speech ; *Speech Perception ; }, abstract = {The speech evoked frequency following response (sFFR) is used to study relationships between neural processing and functional aspects of speech and language that are not captured by click or toneburst evoked auditory brainstem responses (ABR). The sFFR is delayed, deviant, or weak in school-age children with a variety of disorders, including autism, dyslexia, and reading and language disorders, relative to their typically developing peers. Much less is known about the developmental characteristics of the sFFR, especially in preterm infants, who are at risk of language delays. In term neonates, phase locking and spectral representation of the fundamental frequency are developed in the early days of life. Spectral representation of higher harmonics and latencies associated with transient portions of the stimulus are still developing in term infants through at least 10 months of age. The goal of this research was to determine whether the sFFR could be measured in preterm infants and to characterize its developmental trajectory in the time and frequency domains. Click ABR and sFFR were measured in 28 preterm infants at 33 to 64 weeks gestational age. The sFFR could be measured in the majority of infants at 33 weeks gestational age, and the detectability of all sFFR waves was 100% by 64 weeks gestational age. The latencies of all waves associated with the transient portion of the response (waves V, A, and O), and of most waves associated with the quasi-steady-state portion (waves D and E), decreased with increasing age. The interpeak wave A-O latency did not change with age, indicating that these waves share a neural generator or that their neural generators develop at the same rate. The spectral amplitude of F0 and of the lower frequencies of the first formant increased with age, but that of the higher frequencies of the first formant and higher harmonics did not. The results suggest that the sFFR can be reliably recorded in preterm infants, including those cared for in the neonatal intensive care unit. These findings support the view that in preterm infants, F0 amplitude continues to develop within the first 6 months of life and matures before efficient representation of higher-frequency harmonics. Further research is needed to determine whether the sFFR in preterm infants is predictive of long-term language or learning disorders.}, }
@article {pmid34045154, year = {2021}, author = {Andrade, PA and Frič, M and Otčenášek, Z}, title = {Assessment of Changes in Laryngeal Configuration and Voice Parameters Among Different Frequencies of Neuromuscular Electrical Stimulation (NMES) and Cumulative Effects of NMES in a Normophonic Subject: A Pilot Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2021.03.018}, pmid = {34045154}, issn = {1873-4588}, abstract = {INTRODUCTION: Neuromuscular electrical stimulation (NMES) is a complementary resource to voice therapy that can be used for the treatment of hypofunctional voice disorders. Although positive clinical studies have been reported, neutral and even potentially harmful effects of NMES are also described in the literature. Furthermore, in the studies examined by the authors, the use of different methods of NMES have been identified, which further contributes to the inconsistent results found among studies. Moreover, limited rationale is provided for the chosen NMES parameters such as electrode placement, frequency of NMES and length of treatment. The aims of this pilot study were to investigate the a) impact of different frequencies of NMES on glottal configuration and vocal fold vibration patterns and b) changes in laryngeal configuration and vocal output across 12 minutes of NMES.
METHOD: Three experiments were carried out examining changes in laryngeal configuration and voice output using different imaging techniques (fibreoptic nasolaryngoscopy and high-speed video), acoustic analysis (F0, formant analysis, SPL, CPPS and LHSR values), electroglottography (EGG) and Relative Fundamental Frequency (RFF) analyses. Glottal parameters and acoustic measures were recorded before, during, and after stimulation. Data were collected at rest and during phonation.
RESULTS: Overall, the results showed global changes in laryngeal configuration from normal to hyperfunctional (i.e., increased RFF, SPL, CQ, and stiffness). Changes were more pronounced for lower frequencies of NMES and were significant within less than three minutes of application.
CONCLUSION: NMES is an effective resource for the activation of intrinsic laryngeal muscles, producing significant levels of adduction within a few minutes of application. Lower NMES frequencies produced greater muscle activation than higher frequencies.}, }
@article {pmid34043445, year = {2021}, author = {Daliri, A}, title = {A Computational Model for Estimating the Speech Motor System's Sensitivity to Auditory Prediction Errors.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6}, pages = {1841-1854}, pmid = {34043445}, issn = {1558-9102}, support = {R21 DC017563/DC/NIDCD NIH HHS/United States ; }, mesh = {Adaptation, Physiological ; Feedback, Sensory ; Female ; Humans ; Sound ; *Speech ; *Speech Perception ; }, abstract = {Purpose The speech motor system uses feedforward and feedback control mechanisms that are both reliant on prediction errors. Here, we developed a state-space model to estimate the error sensitivity of the control systems. We examined (a) whether the model accounts for the error sensitivity of the control systems and (b) whether the two systems have similar error sensitivity. Method Participants (N = 50) completed an adaptation paradigm, in which their first and second formants were perturbed such that a participant's /ɛ/ would sound like her /æ/. We measured adaptive responses to the perturbations at early (0-80 ms) and late (220-300 ms) time points relative to the onset of the perturbations. As data-driven correlates of the error sensitivity of the feedforward and feedback systems, we used the average early responses and difference responses (i.e., late minus early responses), respectively. We fitted the state-space model to participants' adaptive responses and used the model's parameters as model-based estimates of error sensitivity. Results We found that the late responses were larger than the early responses. Additionally, the model-based estimates of error sensitivity strongly correlated with the data-driven estimates. However, the data-driven and model-based estimates of error sensitivity of the feedforward system did not correlate with those of the feedback system. Conclusions Overall, our results suggested that the dynamics of adaptive responses as well as error sensitivity of the control systems can be accurately predicted by the model. Furthermore, our results suggested that the feedforward and feedback control systems function independently. Supplemental Material https://doi.org/10.23641/asha.14669808.}, }
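The entry above does not give the model equations; a generic single-state state-space model of trial-by-trial adaptation, with a retention factor and an error-sensitivity parameter, captures the idea. This is an illustration in the same spirit, not the author's exact formulation:

```python
import numpy as np

def simulate_adaptation(n_trials=100, perturbation=100.0, a=0.95, b=0.10):
    """x[t+1] = a * x[t] + b * e[t], where e[t] is the residual auditory error.

    a: retention factor (how much of the adapted state carries over)
    b: error sensitivity (fraction of each error corrected on the next trial)
    """
    x = np.zeros(n_trials)                 # adaptive response in formant units (Hz)
    for t in range(n_trials - 1):
        error = perturbation - x[t]        # error experienced on trial t
        x[t + 1] = a * x[t] + b * error    # retained state plus correction
    return x

response = simulate_adaptation()
print(response[0], response[-1])           # rises toward partial compensation (~67 Hz)
```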
@article {pmid34019777, year = {2021}, author = {Souza, PE and Ellis, G and Marks, K and Wright, R and Gallun, F}, title = {Does the Speech Cue Profile Affect Response to Amplitude Envelope Distortion?.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6}, pages = {2053-2069}, pmid = {34019777}, issn = {1558-9102}, support = {R01 DC006014/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Cues ; *Hearing Aids ; *Hearing Loss, Sensorineural ; Humans ; Speech ; *Speech Perception ; }, abstract = {Purpose A broad area of interest to our group is to understand the consequences of the "cue profile" (a measure of how well a listener can utilize audible temporal and/or spectral cues) for listening scenarios in which a subset of cues is distorted. The study goal was to determine if listeners whose cue profile indicated that they primarily used temporal cues for recognition would respond differently to speech-envelope distortion than listeners who utilized both spectral and temporal cues. Method Twenty-five adults with sensorineural hearing loss participated in the study. The listener's cue profile was measured by analyzing identification patterns for a set of synthetic syllables in which envelope rise time and formant transitions were varied. A linear discriminant analysis quantified the relative contributions of spectral and temporal cues to identification patterns. Low-context sentences in noise were processed with time compression, wide-dynamic range compression, or a combination of time compression and wide-dynamic range compression to create a range of speech-envelope distortions. An acoustic metric, a modified version of the Spectral Correlation Index, was calculated to quantify envelope distortion. Results A binomial generalized linear mixed-effects model indicated that envelope distortion, the cue profile, the interaction between envelope distortion and the cue profile, and the pure-tone average were significant predictors of sentence recognition. Conclusions The listeners with good perception of spectro-temporal contrasts were more resilient to the detrimental effects of envelope compression than listeners who used temporal cues to a greater extent. The cue profile may provide information about individual listening that can direct the choice of hearing aid parameters, especially those parameters that affect the speech envelope.}, }
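A sketch of the linear-discriminant step described in this entry, quantifying how strongly a temporal cue (envelope rise time) versus a spectral cue (formant transition) predicts a listener's identification responses. Data and labels are synthetic placeholders:

```python
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.default_rng(1)
# Columns: envelope rise time and formant-transition extent (standardized units)
X = rng.normal(size=(200, 2))
# Synthetic responses from a listener who weights the spectral cue more heavily:
y = (0.2 * X[:, 0] + 1.0 * X[:, 1] + rng.normal(0, 0.3, 200) > 0).astype(int)

lda = LinearDiscriminantAnalysis().fit(X, y)
w = lda.coef_[0]
print(w / np.abs(w).sum())  # relative temporal vs. spectral cue weights
```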
@article {pmid33987821, year = {2021}, author = {Stilp, CE and Assgari, AA}, title = {Contributions of natural signal statistics to spectral context effects in consonant categorization.}, journal = {Attention, perception & psychophysics}, volume = {83}, number = {6}, pages = {2694-2708}, pmid = {33987821}, issn = {1943-393X}, mesh = {Acoustic Stimulation ; Humans ; Language ; *Phonetics ; Sound ; Sound Spectrography ; Speech Acoustics ; *Speech Perception ; }, abstract = {Speech perception, like all perception, takes place in context. Recognition of a given speech sound is influenced by the acoustic properties of surrounding sounds. When the spectral composition of earlier (context) sounds (e.g., a sentence with more energy at lower third formant [F3] frequencies) differs from that of a later (target) sound (e.g., consonant with intermediate F3 onset frequency), the auditory system magnifies this difference, biasing target categorization (e.g., towards higher-F3-onset /d/). Historically, these studies used filters to force context stimuli to possess certain spectral compositions. Recently, these effects were produced using unfiltered context sounds that already possessed the desired spectral compositions (Stilp & Assgari, 2019, Attention, Perception, & Psychophysics, 81, 2037-2052). Here, this natural signal statistics approach is extended to consonant categorization (/g/-/d/). Context sentences were either unfiltered (already possessing the desired spectral composition) or filtered (to imbue specific spectral characteristics). Long-term spectral characteristics of unfiltered contexts were poor predictors of shifts in consonant categorization, but short-term characteristics (last 475 ms) were excellent predictors. This diverges from vowel data, where long-term and shorter-term intervals (last 1,000 ms) were equally strong predictors. Thus, time scale plays a critical role in how listeners attune to signal statistics in the acoustic environment.}, }
@article {pmid33979206, year = {2021}, author = {Dromey, C and Richins, M and Low, T}, title = {Kinematic and Acoustic Changes to Vowels and Diphthongs in Bite Block Speech.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6}, pages = {1794-1801}, doi = {10.1044/2021_JSLHR-20-00630}, pmid = {33979206}, issn = {1558-9102}, mesh = {Acoustics ; Biomechanical Phenomena ; Humans ; Phonetics ; *Speech ; *Speech Acoustics ; Young Adult ; }, abstract = {Purpose We examined the effect of bite block insertion (BBI) on lingual movements and formant frequencies in corner vowel and diphthong production in a sentence context. Method Twenty young adults produced the corner vowels (/u/, /ɑ/, /æ/, /i/) and the diphthong /ɑɪ/ in sentence contexts before and after BBI. An electromagnetic articulograph measured the movements of the tongue back, middle, and front. Results There were significant decreases in the acoustic vowel articulation index and vowel space area following BBI. The kinematic vowel articulation index decreased significantly for the back and middle of the tongue but not for the front. There were no significant acoustic changes post-BBI for the diphthong, other than a longer transition duration. Diphthong kinematic changes after BBI included smaller movements for the back and middle of the tongue, but not the front. Conclusions BBI led to a smaller acoustic working space for the corner vowels. The adjustments made by the front of the tongue were sufficient to compensate for the BBI perturbation in the diphthong, resulting in unchanged formant trajectories. The back and middle of the tongue were likely biomechanically restricted in their displacement by the fixation of the jaw, whereas the tongue front showed greater movement flexibility.}, }
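The acoustic vowel space area used in studies like the one above can be computed with the shoelace formula over the (F1, F2) vertices of the corner vowels. A sketch with illustrative formant values (not the study's data):

```python
def vowel_space_area(points):
    """Area (Hz^2) of the polygon whose vertices are (F1, F2) pairs, via the
    shoelace formula; vertices must be ordered around the perimeter."""
    area = 0.0
    n = len(points)
    for i in range(n):
        x1, y1 = points[i]
        x2, y2 = points[(i + 1) % n]
        area += x1 * y2 - x2 * y1
    return abs(area) / 2.0

# Illustrative adult male corner vowels (F1, F2) in Hz:
corners = [(270, 2290),  # /i/
           (660, 1720),  # /ae/
           (730, 1090),  # /a/
           (300, 870)]   # /u/
print(vowel_space_area(corners))  # ~411,500 Hz^2; shrinks after bite block insertion
```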
@article {pmid33977813, year = {2021}, author = {Onosson, S and Stewart, J}, title = {The Effects of Language Contact on Non-Native Vowel Sequences in Lexical Borrowings: The Case of Media Lengua.}, journal = {Language and speech}, volume = {}, number = {}, pages = {238309211014911}, doi = {10.1177/00238309211014911}, pmid = {33977813}, issn = {1756-6053}, abstract = {Media Lengua (ML), a mixed language derived from Quichua and Spanish, exhibits a phonological system that largely conforms to that of Quichua acoustically. Yet, it incorporates a large number of vowel sequences from Spanish which do not occur in the Quichua system. This includes the use of mid-vowels, which are phonetically realized in ML as largely overlapping with the high-vowels in acoustic space. We analyze and compare production of vowel sequences by speakers of ML, Quichua, and Spanish through the use of generalized additive mixed models to determine statistically significant differences between vowel formant trajectories. Our results indicate that Spanish-derived ML vowel sequences frequently differ significantly from their Spanish counterparts, largely occupying a more central region of the vowel space and frequently exhibiting markedly reduced trajectories over time. In contrast, we find only one case where an ML vowel sequence differs significantly from its Quichua counterpart-and even in this case the difference from Spanish is substantially greater. Our findings show how the vowel system of ML successfully integrates novel vowel sequence patterns from Spanish into what is essentially Quichua phonology by markedly adapting their production, while still maintaining contrasts which are not expressed in Quichua.}, }
@article {pmid33951578, year = {2021}, author = {Isler, B and Giroud, N and Hirsiger, S and Kleinjung, T and Meyer, M}, title = {Bilateral age-related atrophy in the planum temporale is associated with vowel discrimination difficulty in healthy older adults.}, journal = {Hearing research}, volume = {406}, number = {}, pages = {108252}, doi = {10.1016/j.heares.2021.108252}, pmid = {33951578}, issn = {1878-5891}, mesh = {Acoustic Stimulation ; Aged ; Atrophy/pathology ; *Auditory Cortex/diagnostic imaging ; Humans ; Speech ; *Speech Perception ; *Temporal Lobe/pathology ; }, abstract = {In this study we investigated the association between age-related brain atrophy and behavioural as well as electrophysiological markers of vowel perception in a sample of healthy younger and older adults with normal pure-tone hearing. Twenty-three older adults and 27 younger controls discriminated a set of vowels with altered second formants embedded in consonant-vowel syllables. Additionally, mismatch negativity (MMN) responses were recorded in a separate oddball paradigm with the same set of stimuli. A structural magnetic resonance scan was obtained for each participant to determine the cortical architecture of the left and right planum temporale (PT). The PT was chosen for its function as a major processor of auditory cues and speech. The results suggested that older adults performed worse in vowel discrimination despite normal-for-age pure-tone hearing. In the older group, we found evidence that those with greater age-related cortical atrophy (i.e., lower cortical surface area and cortical volume) in the left and right PT also showed weaker vowel discrimination. In comparison, we found a lateralized correlation in the younger group, suggesting that those with greater cortical thickness in only the left PT showed weaker performance in the vowel discrimination task. We did not find any associations between macroanatomical traits of the PT and MMN responses. We conclude that deficient vowel processing is not only caused by pure-tone hearing loss but is also influenced by atrophy-related changes in the ageing auditory-related cortices. Furthermore, our results suggest that auditory processing might become more bilateral across the lifespan.}, }
@article {pmid33938165, year = {2021}, author = {Xiao, Y and Wang, T and Deng, W and Yang, L and Zeng, B and Lao, X and Zhang, S and Liu, X and Ouyang, D and Liao, G and Liang, Y}, title = {Data mining of an acoustic biomarker in tongue cancers and its clinical validation.}, journal = {Cancer medicine}, volume = {10}, number = {11}, pages = {3822-3835}, pmid = {33938165}, issn = {2045-7634}, mesh = {Adult ; Aged ; Analysis of Variance ; Area Under Curve ; Articulation Disorders/diagnosis/*physiopathology ; China ; Cross-Sectional Studies ; *Data Mining ; Female ; Humans ; Male ; Middle Aged ; Quality of Life ; Sex Factors ; Speech Production Measurement/methods ; Support Vector Machine ; Tongue/surgery ; Tongue Neoplasms/diagnosis/pathology/*physiopathology/surgery ; }, abstract = {The promise of speech disorders as biomarkers in clinical examination has been identified in a broad spectrum of neurodegenerative diseases. However, to the best of our knowledge, a validated acoustic marker with established discriminative and evaluative properties has not yet been developed for oral tongue cancers. Here we cross-sectionally collected a screening dataset that included acoustic parameters extracted from 3 sustained vowels /ɑ/, /i/, /u/ and binary perceptual outcomes from 12 consonant-vowel syllables. We used a support vector machine with a linear kernel within this dataset to identify the formant centralization ratio (FCR) as a dominant predictor of different perceptual outcomes across gender and syllable. The Acoustic analysis, Perceptual evaluation and Quality of Life assessment (APeQoL) was used to validate the FCR in 33 patients with primary resectable oral tongue cancers. Measurements were taken before (pre-op) and four to six weeks after (post-op) surgery. The speech handicap index (SHI), a speech-specific questionnaire, was also administered at these time points. Pre-op correlation analysis within the APeQoL revealed overall consistency and a strong correlation between FCR and SHI scores. FCRs also increased significantly with increasing T classification pre-operatively, especially for women. Longitudinally, the main effects of T classification, the extent of resection, and their interaction effects with time (pre-op vs. post-op) on FCRs were all significant. For pre-operative FCR, after merging the two datasets, a cut-off value of 0.970 produced an AUC of 0.861 (95% confidence interval: 0.785-0.938) for T3-4 patients. In sum, this study determined that FCR is an acoustic marker with the potential to detect disease and related speech function in oral tongue cancers. These are preliminary findings that need to be replicated in longitudinal studies and/or larger cohorts.}, }
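The entry above does not spell out the FCR formula; as commonly defined in the motor speech literature (Sapir et al., 2010), FCR = (F2/u/ + F2/ɑ/ + F1/i/ + F1/u/) / (F2/i/ + F1/ɑ/), and it rises toward 1 as vowels centralize. A sketch under that assumption, with illustrative formant values:

```python
def fcr(f1_i, f1_u, f1_a, f2_i, f2_u, f2_a):
    """Formant centralization ratio; higher values = more centralized vowels.
    Standard definition from the literature, assumed (not given) in the abstract."""
    return (f2_u + f2_a + f1_i + f1_u) / (f2_i + f1_a)

# Illustrative values (Hz) for a speaker with well-separated corner vowels:
print(fcr(f1_i=300, f1_u=350, f1_a=750, f2_i=2300, f2_u=900, f2_a=1200))
# ~0.90; values above the paper's 0.970 cut-off would flag centralization
```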
@article {pmid33909840, year = {2021}, author = {Rocha-Muniz, CN and Schochat, E}, title = {Investigation of the neural discrimination of acoustic characteristics of speech sounds in normal-hearing individuals through Frequency-following Response (FFR).}, journal = {CoDAS}, volume = {33}, number = {1}, pages = {e20180324}, doi = {10.1590/2317-1782/20202018324}, pmid = {33909840}, issn = {2317-1782}, mesh = {Acoustic Stimulation ; Acoustics ; Child ; Evoked Potentials, Auditory, Brain Stem ; Hearing ; Humans ; *Phonetics ; *Speech Perception ; }, abstract = {PURPOSE: To evaluate how the auditory pathways encode and discriminate the plosive syllables [ga], [da] and [ba] using the auditory evoked Frequency-following Response (FFR) in children with typical development.
METHODS: Twenty children aged 6-12 years were evaluated using the FFR for the [ga], [da] and [ba] stimuli. The stimuli were composed of six formants and were differentiated in the F2 to F3 transition (transient portion). The other formants were identical across the three syllables (sustained portion). The latencies of the 16 waves of the transient portion (<70 ms) and of the 21 waves of the sustained portion (90-160 ms) of the stimuli were analyzed in the neural responses obtained for each of the syllables.
RESULTS: The transient portion latencies were different in the three syllables, indicating a distinction in the acoustic characteristics of these syllables through their neural representations. In addition, the transient portion latencies progressively increased in the following order: [ga] <[da] <[ba], whereas no significant differences were observed in the sustained portion.
CONCLUSION: The FFR proved to be an efficient tool to investigate the subcortical acoustic differences in speech sounds, since it demonstrated different electrophysiological responses for the three evoked syllables. Changes in latency were observed in the transient portion (consonants) but not in the sustained portion (vowels) for the three stimuli. These results indicate the neural ability to distinguish between acoustic characteristics of the [ga], [da] and [ba] stimuli.}, }
@article {pmid33900806, year = {2021}, author = {Chiu, YF and Neel, A and Loux, T}, title = {Exploring the Acoustic Perceptual Relationship of Speech in Parkinson's Disease.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {5}, pages = {1560-1570}, doi = {10.1044/2021_JSLHR-20-00610}, pmid = {33900806}, issn = {1558-9102}, mesh = {Acoustics ; Aged ; Dysarthria/diagnosis/etiology ; Humans ; *Parkinson Disease/complications ; *Speech ; Speech Acoustics ; Speech Intelligibility ; Speech Production Measurement ; }, abstract = {Purpose Auditory perceptual judgments are commonly used to diagnose dysarthria and assess treatment progress. The purpose of the study was to examine the acoustic underpinnings of perceptual speech abnormalities in individuals with Parkinson's disease (PD). Method Auditory perceptual judgments were obtained from sentences produced by 13 speakers with PD and five healthy older adults. Twenty young listeners rated overall ease of understanding, articulatory precision, voice quality, and prosodic adequacy on a visual analog scale. Acoustic measures associated with the speech subsystems of articulation, phonation, and prosody were obtained, including second formant transitions, articulation rate, cepstral and spectral measures of voice, and pitch variations. Regression analyses were performed to assess the relationships between perceptual judgments and acoustic variables. Results Perceptual impressions of Parkinsonian speech were related to combinations of several acoustic variables. Approximately 36%-49% of the variance in the perceptual ratings was explained by the acoustic measures, indicating a modest acoustic-perceptual relationship. Conclusions The relationships between perceptual ratings and acoustic signals in Parkinsonian speech are multifactorial and involve a variety of acoustic features simultaneously. The modest acoustic-perceptual relationships, however, suggest that future work is needed to further examine the acoustic bases of perceptual judgments in dysarthria.}, }
@article {pmid33900786, year = {2021}, author = {Parrell, B and Ivry, RB and Nagarajan, SS and Houde, JF}, title = {Intact Correction for Self-Produced Vowel Formant Variability in Individuals With Cerebellar Ataxia Regardless of Auditory Feedback Availability.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6S}, pages = {2234-2247}, pmid = {33900786}, issn = {1558-9102}, support = {R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; }, mesh = {*Cerebellar Ataxia ; Feedback ; Feedback, Sensory ; Humans ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception ; }, abstract = {Purpose Individuals with cerebellar ataxia (CA) caused by cerebellar degeneration exhibit larger reactive compensatory responses to unexpected auditory feedback perturbations than neurobiologically typical speakers, suggesting they may rely more on feedback control during speech. We test this hypothesis by examining variability in unaltered speech. Previous studies of typical speakers have demonstrated a reduction in formant variability (centering) observed during the initial phase of vowel production from vowel onset to vowel midpoint. Centering is hypothesized to reflect feedback-based corrections for self-produced variability and thus may provide a behavioral assay of feedback control in unperturbed speech in the same manner as the compensatory response does for feedback perturbations. Method To comprehensively compare centering in individuals with CA and controls, we examine centering in two vowels (/i/ and /ɛ/) under two contexts (isolated words and connected speech). As a control, we examine speech produced both with and without noise to mask auditory feedback. Results Individuals with CA do not show increased centering compared to age-matched controls, regardless of vowel, context, or masking. Contrary to previous results in neurobiologically typical speakers, centering was not affected by the presence of masking noise in either group. Conclusions The similar magnitude of centering seen with and without masking noise questions whether centering is driven by auditory feedback. However, if centering is at least partially driven by auditory/somatosensory feedback, these results indicate that the larger compensatory response to altered auditory feedback observed in individuals with CA may not reflect typical motor control processes during normal, unaltered speech production.}, }
@article {pmid33895925, year = {2021}, author = {Kovalenko, AN and Kastyro, IV and Reshetov, IV and Popadyuk, VI}, title = {Study of the Role of Hearing Aid on the Area of the Acoustic Field of Vowels.}, journal = {Doklady. Biochemistry and biophysics}, volume = {497}, number = {1}, pages = {108-111}, pmid = {33895925}, issn = {1608-3091}, mesh = {*Acoustics ; Adult ; Female ; *Hearing Aids ; Humans ; Male ; Sound ; }, abstract = {The method of transformation of acoustic vowel triangles (AVT) /a/, /i/, /u/ was used for an objective assessment of the acoustic features of vowels in the speech production of 20 persons with long-term hearing impairment (LHI). The logarithm of the values of the first two formants of each vowel (logF1, logF2) was determined for each subject. AVTs were transformed into right-angled triangles whose /u/ vertices were moved to the origin of coordinates and whose legs were aligned with the coordinate axes. In patients with LHI, the size of the triangles usually decreased, and they were stretched along one of the axes, which probably depends not only on the hearing loss severity but also on the duration of hearing aid use. The presented approach to the normalization of AVTs makes it possible to distinguish at least three groups of persons with LHI: in the first group, vowel triangles are stretched along the logF1 axis; in the second group, vowel triangles are stretched along the logF2 axis; and in the third group, AVTs are symmetric.}, }
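As a rough illustration of the vowel-triangle normalization described in the preceding entry, the sketch below builds the log-formant triangle and translates its /u/ vertex to the origin. The rotation that aligns the legs with the coordinate axes is only described qualitatively in the abstract, so the stretch along each axis is approximated here by the axis-wise distances from /u/; this is an assumption-laden reading, not the authors' procedure.

    import numpy as np

    def normalized_avt(f1, f2):
        """f1, f2: dicts keyed by 'a', 'i', 'u' with mean formants in Hz.
        Returns log-formant vertices translated so /u/ sits at the origin,
        plus the stretch of the triangle along each log axis."""
        verts = {v: np.array([np.log10(f1[v]), np.log10(f2[v])]) for v in 'aiu'}
        shifted = {v: verts[v] - verts['u'] for v in 'aiu'}
        return shifted, (max(p[0] for p in shifted.values()),   # logF1 stretch
                         max(p[1] for p in shifted.values()))   # logF2 stretch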
@article {pmid33863624, year = {2021}, author = {Lã, FMB and Silva, LS and Granqvist, S}, title = {Long-Term Average Spectrum Characteristics of Portuguese Fado-Canção from Coimbra.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2021.03.005}, pmid = {33863624}, issn = {1873-4588}, abstract = {Descriptions of acoustical characteristics of Fado, a Portuguese urban style sung in Lisbon and Oporto, are scarce, particularly concerning Fado-Canção, a related style sung in Coimbra. The present study aims at describing long-term average spectrum (LTAS) parameters of 16 professional singers while singing and reading the lyrics of a typical Fado-Canção. LTAS parameters were investigated in terms of: (1) equivalent sound level (Leq); (2) spectral differences between 3 frequency bands 0-2, 2-5, and 5-8 kHz; and (3) quantification of spectral prominence between 2 and 4 kHz, calculated as the level difference between the peak in this frequency region and a reference trendline between 1 and 5 kHz, henceforth Formant Cluster Prominence (FCP). Given that Fado-Canção, besides Fado and traditional styles, originated also from classical singing, and that previous studies on Fado suggest the absence of a singer's formant cluster, the averaged LTAS for all Fado-Canção singers was further compared to the LTAS of two world-touring opera baritones singing an operatic aria and a lied. Results show that Fado-Canção is commonly sung with a Leq of 86.4 dB and a FCP of about 10 dB, values significantly higher when compared to reading. The FCP in Fado-Canção, although smaller than for the two classical opera singers' examples (14.8 and 20 dB, respectively), suggests that the style preserved some of its original lyrical influence. However, because younger singers present higher energy in the 5-8 kHz region relative to the remaining frequency bands as compared to older singers, it seems that Fado-Canção may be drifting towards non-classical vocal practices. FCP seems to be a promising straightforward method to quantify the degree of formant clustering around the region of the singer's formant in LTAS, allowing comparisons between different singers and singing styles.}, }
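The FCP measure defined in the preceding entry, the level difference between the 2-4 kHz peak of the LTAS and a 1-5 kHz reference trendline, can be sketched in a few lines. The straight-line fit over frequency used below is an assumption; the authors' exact fitting procedure may differ.

    import numpy as np

    def formant_cluster_prominence(freqs_hz, levels_db):
        """FCP from an LTAS given as parallel arrays of frequency (Hz) and
        level (dB): peak level in 2-4 kHz minus the value, at the peak
        frequency, of a trendline fitted over 1-5 kHz."""
        freqs_hz = np.asarray(freqs_hz, float)
        levels_db = np.asarray(levels_db, float)
        band = (freqs_hz >= 1000) & (freqs_hz <= 5000)
        slope, intercept = np.polyfit(freqs_hz[band], levels_db[band], 1)
        region = (freqs_hz >= 2000) & (freqs_hz <= 4000)
        i_peak = np.argmax(np.where(region, levels_db, -np.inf))
        return levels_db[i_peak] - (slope * freqs_hz[i_peak] + intercept)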
@article {pmid33856659, year = {2021}, author = {Loni, DY and Subbaraman, S}, title = {Genetically related singers-acoustic feature analysis and impact on singer identification.}, journal = {Journal of applied genetics}, volume = {62}, number = {3}, pages = {459-467}, pmid = {33856659}, issn = {2190-3883}, mesh = {Acoustics ; Female ; Humans ; Male ; Music ; Parents ; Siblings ; Singing/*genetics ; Voice Quality/*genetics ; }, abstract = {Studies relating music to genetics form a fascinating field of research. In this study, we have attempted to answer a most curious question: how acoustically close are genetically related singers? The present study investigated this question using two different genetic relationships: three female sibling singers, and a father-son singer pair. These are famous Indian playback singers, and the acoustic features were extracted from songs of Bollywood films. Three different sets of self-developed a cappella databases were used for the experimentation. Positive correlations among the major musical aptitudes - pitch, vibrato, formant, and harmonic spectral envelope - for both singer relationships revealed the genetic impact on the acoustic features. Also, the investigation of the timbre spectral feature showed it to be a significant acoustic feature for differentiating similar voices. With Spearman's correlation coefficient, we conclude that a strong acoustical association exists between the acoustic features of genetically related singers, especially the female sibling singers. This was further validated by correlating these singers with genetically unrelated singers. A human perception test performed using cover songs indicated the genetic impact on voice similarity, while the automatic singer identification system discriminated singers more accurately than the human listeners.}, }
@article {pmid33833720, year = {2021}, author = {Hsieh, IH and Yeh, WT}, title = {The Interaction Between Timescale and Pitch Contour at Pre-attentive Processing of Frequency-Modulated Sweeps.}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {637289}, pmid = {33833720}, issn = {1664-1078}, abstract = {Speech comprehension across languages depends on encoding the pitch variations in frequency-modulated (FM) sweeps at different timescales and frequency ranges. While timescale and spectral contour of FM sweeps play important roles in differentiating acoustic speech units, relatively little work has been done to understand the interaction between the two acoustic dimensions at early cortical processing. An auditory oddball paradigm was employed to examine the interaction of timescale and pitch contour at pre-attentive processing of FM sweeps. Event-related potentials to frequency sweeps that vary in linguistically relevant pitch contour (fundamental frequency F0 vs. first formant frequency F1) and timescale (local vs. global) in Mandarin Chinese were recorded. Mismatch negativities (MMNs) were elicited by all types of sweep deviants. For the local timescale, FM sweeps with F0 contours yielded larger MMN amplitudes than F1 contours. A reversed MMN amplitude pattern was obtained with respect to F0/F1 contours for global-timescale stimuli. An interhemispheric asymmetry of MMN topography was observed corresponding to local- and global-timescale contours. In the difference waveforms, falling but not rising sweep contours elicited right-hemispheric dominance. Results showed that timescale and pitch contour interact with each other in pre-attentive auditory processing of FM sweeps. Findings suggest that FM sweeps, a type of non-speech signal, are processed at an early stage with reference to their linguistic function. That the dynamic interaction between timescale and spectral pattern is processed during early cortical processing of non-speech frequency sweep signals may be critical to facilitate speech encoding at a later stage.}, }
@article {pmid33833252, year = {2021}, author = {Wright, E and Grawunder, S and Ndayishimiye, E and Galbany, J and McFarlin, SC and Stoinski, TS and Robbins, MM}, title = {Chest beats as an honest signal of body size in male mountain gorillas (Gorilla beringei beringei).}, journal = {Scientific reports}, volume = {11}, number = {1}, pages = {6879}, pmid = {33833252}, issn = {2045-2322}, mesh = {Acoustics ; Animals ; *Body Size ; *Competitive Behavior ; Gorilla gorilla/*physiology ; Male ; *Reproduction ; Thorax/*physiology ; Vocalization, Animal/*physiology ; }, abstract = {Acoustic signals that reliably indicate body size, which usually determines competitive ability, are of particular interest for understanding how animals assess rivals and choose mates. Whereas body size tends to be negatively associated with formant dispersion in animal vocalizations, non-vocal signals have received little attention. Among the most emblematic sounds in the animal kingdom is the chest beat of gorillas, a non-vocal signal that is thought to be important in intra and inter-sexual competition, yet it is unclear whether it reliably indicates body size. We examined the relationship among body size (back breadth), peak frequency, and three temporal characteristics of the chest beat: duration, number of beats and beat rate from sound recordings of wild adult male mountain gorillas. Using linear mixed models, we found that larger males had significantly lower peak frequencies than smaller ones, but we found no consistent relationship between body size and the temporal characteristics measured. Taken together with earlier findings of positive correlations among male body size, dominance rank and reproductive success, we conclude that the gorilla chest beat is an honest signal of competitive ability. These results emphasize the potential of non-vocal signals to convey important information in mammal communication.}, }
@article {pmid33831309, year = {2021}, author = {Jekiel, M and Malarski, K}, title = {Musical Hearing and Musical Experience in Second Language English Vowel Acquisition.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {5}, pages = {1666-1682}, doi = {10.1044/2021_JSLHR-19-00253}, pmid = {33831309}, issn = {1558-9102}, mesh = {Adult ; Hearing ; Hearing Tests ; Humans ; Language ; *Multilingualism ; *Music ; Phonetics ; *Speech Perception ; }, abstract = {Purpose Previous studies suggested that music perception can help produce certain accentual features in the first and second language (L2), such as intonational contours. What was missing in many of these studies was the identification of the exact relationship between specific music perception skills and the production of different accentual features in a foreign language. Our aim was to verify whether empirically tested musical hearing skills can be related to the acquisition of English vowels by learners of English as an L2 before and after a formal accent training course. Method Fifty adult Polish speakers of L2 English were tested before and after a two-semester accent training course in order to observe the effect of musical hearing on the acquisition of English vowels. Their L2 English vowel formant contours produced in consonant-vowel-consonant context were compared with the target General British vowels produced by their pronunciation teachers. We juxtaposed these results with their musical hearing test scores and self-reported musical experience to observe a possible relationship between successful L2 vowel acquisition and musical aptitude. Results Preexisting rhythmic memory was reported as a significant predictor before training, while musical experience was reported as a significant factor in the production of more native-like L2 vowels after training. We also observed that not all vowels were equally acquired or affected by musical hearing or musical experience. The strongest estimate we observed was closeness to the model before training, suggesting that learners who already managed to acquire some features of a native-like accent were also more successful after training. Conclusions Our results are revealing in two respects. First, the learners' former proficiency in L2 pronunciation is the most robust predictor in acquiring a native-like accent. Second, there is a potential relationship between rhythmic memory and L2 vowel acquisition before training, as well as years of musical experience after training, suggesting that specific musical skills and music practice can be an asset in learning a foreign language accent.}, }
@article {pmid33825503, year = {2021}, author = {Michell, CT and Nyman, T}, title = {Microbiomes of willow-galling sawflies: effects of host plant, gall type, and phylogeny on community structure and function.}, journal = {Genome}, volume = {64}, number = {6}, pages = {615-626}, doi = {10.1139/gen-2020-0018}, pmid = {33825503}, issn = {1480-3321}, mesh = {Animals ; Bacteria/*classification/*genetics ; Biodiversity ; Host Microbial Interactions ; Host Specificity ; Insecta ; Larva ; Microbiota/*genetics/*physiology ; *Phylogeny ; Plant Growth Regulators ; Plant Leaves ; RNA, Ribosomal, 16S/genetics ; Salix/*microbiology ; }, abstract = {While free-living herbivorous insects are thought to harbor microbial communities composed of transient bacteria derived from their diet, recent studies indicate that insects that induce galls on plants may be involved in more intimate host-microbe relationships. We used 16S rDNA metabarcoding to survey larval microbiomes of 20 nematine sawfly species that induce bud or leaf galls on 13 Salix species. The 391 amplicon sequence variants (ASVs) detected represented 69 bacterial genera in six phyla. Multi-variate statistical analyses showed that the structure of larval microbiomes is influenced by willow host species as well as by gall type. Nevertheless, a "core" microbiome composed of 58 ASVs is shared widely across the focal galler species. Within the core community, the presence of many abundant, related ASVs representing multiple distantly related bacterial taxa is reflected as a statistically significant effect of bacterial phylogeny on galler-microbe associations. Members of the core community have a variety of inferred functions, including degradation of phenolic compounds, nutrient supplementation, and production of plant hormones. Hence, our results support suggestions of intimate and diverse interactions between galling insects and microbes and add to a growing body of evidence that microbes may play a role in the induction of insect galls on plants.}, }
@article {pmid33798490, year = {2021}, author = {Zhang, K and Sjerps, MJ and Peng, G}, title = {Integral perception, but separate processing: The perceptual normalization of lexical tones and vowels.}, journal = {Neuropsychologia}, volume = {156}, number = {}, pages = {107839}, doi = {10.1016/j.neuropsychologia.2021.107839}, pmid = {33798490}, issn = {1873-3514}, mesh = {Adult ; Cues ; Humans ; Language ; Phonetics ; Pitch Perception ; Speech ; *Speech Acoustics ; *Speech Perception ; }, abstract = {In tonal languages, speech variability arises in both lexical tone (i.e., suprasegmentally) and vowel quality (segmentally). Listeners can use surrounding speech context to overcome variability in both speech cues, a process known as extrinsic normalization. Although vowels are the main carriers of tones, it is still unknown whether the combined percept (lexical tone and vowel quality) is normalized integrally or in partly separate processes. Here we used electroencephalography (EEG) to investigate the time course of lexical tone normalization and vowel normalization to answer this question. Cantonese adults listened to synthesized three-syllable stimuli in which the identity of a target syllable - ambiguous between high vs. mid-tone (Tone condition) or between /o/ vs. /u/ (Vowel condition) - was dependent on either the tone range (Tone condition) or the formant range (Vowel condition) of the first two syllables. It was observed that the ambiguous tone was more often interpreted as a high-level tone when the context had a relatively low pitch than when it had a high pitch (Tone condition). Similarly, the ambiguous vowel was more often interpreted as /o/ when the context had a relatively low formant range than when it had a relatively high formant range (Vowel condition). These findings show the typical pattern of extrinsic tone and vowel normalization. Importantly, the EEG results of participants showing the contrastive normalization effect demonstrated that the effects of vowel normalization could already be observed within the N2 time window (190-350 ms), while the first reliable effect of lexical tone normalization on cortical processing was observable only from the P3 time window (220-500 ms) onwards. The ERP patterns demonstrate that the contrastive perceptual normalization of lexical tones and that of vowels occur at least in partially separate time windows. This suggests that the extrinsic normalization can operate at the level of phonemes and tonemes separately instead of operating on the whole syllable at once.}, }
@article {pmid33795617, year = {2021}, author = {Smith, ML and Winn, MB}, title = {Individual Variability in Recalibrating to Spectrally Shifted Speech: Implications for Cochlear Implants.}, journal = {Ear and hearing}, volume = {42}, number = {5}, pages = {1412-1427}, pmid = {33795617}, issn = {1538-4667}, support = {R01 DC017114/DC/NIDCD NIH HHS/United States ; R03 DC014309/DC/NIDCD NIH HHS/United States ; }, mesh = {*Cochlear Implantation ; *Cochlear Implants ; Humans ; Reproducibility of Results ; Speech ; *Speech Perception ; }, abstract = {OBJECTIVES: Cochlear implant (CI) recipients are at a severe disadvantage compared with normal-hearing listeners in distinguishing consonants that differ by place of articulation because the key relevant spectral differences are degraded by the implant. One component of that degradation is the upward shifting of spectral energy that occurs with a shallow insertion depth of a CI. The present study aimed to systematically measure the effects of spectral shifting on word recognition and phoneme categorization by specifically controlling the amount of shifting and using stimuli whose identification specifically depends on perceiving frequency cues. We hypothesized that listeners would be biased toward perceiving phonemes that contain higher-frequency components because of the upward frequency shift and that intelligibility would decrease as spectral shifting increased.
DESIGN: Normal-hearing listeners (n = 15) heard sine wave-vocoded speech with simulated upward frequency shifts of 0, 2, 4, and 6 mm of cochlear space to simulate shallow CI insertion depth. Stimuli included monosyllabic words and /b/-/d/ and /ʃ/-/s/ continua that varied systematically by formant frequency transitions or frication noise spectral peaks, respectively. Recalibration to spectral shifting was operationally defined as shifting perceptual acoustic-phonetic mapping commensurate with the spectral shift - in other words, adjusting frequency expectations for both phonemes upward so that there is still a perceptual distinction, rather than hearing all upward-shifted phonemes as the higher-frequency member of the pair.
RESULTS: For moderate amounts of spectral shifting, group data suggested a general "halfway" recalibration to spectral shifting, but individual data suggested a notably different conclusion: half of the listeners were able to recalibrate fully, while the other half of the listeners were utterly unable to categorize shifted speech with any reliability. There were no participants who demonstrated a pattern intermediate to these two extremes. Intelligibility of words decreased with greater amounts of spectral shifting, also showing loose clusters of better- and poorer-performing listeners. Phonetic analysis of word errors revealed certain cues were more susceptible to being compromised due to a frequency shift (place and manner of articulation), while voicing was robust to spectral shifting.
CONCLUSIONS: Shifting the frequency spectrum of speech has systematic effects that are in line with known properties of speech acoustics, but the ensuing difficulties cannot be predicted based on tonotopic mismatch alone. Difficulties are subject to substantial individual differences in the capacity to adjust acoustic-phonetic mapping. These results help to explain why speech recognition in CI listeners cannot be fully predicted by peripheral factors like electrode placement and spectral resolution; even among listeners with functionally equivalent auditory input, there is an additional factor of simply being able or unable to flexibly adjust acoustic-phonetic mapping. This individual variability could motivate precise treatment approaches guided by an individual's relative reliance on wideband frequency representation (even if it is mismatched) or limited frequency coverage whose tonotopy is preserved.}, }
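The "mm of cochlear space" shifts in the preceding entry map naturally onto a cochlear place-to-frequency function. CI simulation studies of this kind conventionally use Greenwood's (1990) function, so the sketch below assumes it; the abstract does not name the authors' exact implementation.

    import math

    # Greenwood (1990) constants for the human cochlea (x in mm from apex).
    A, a, k = 165.4, 0.06, 0.88

    def place_to_freq(x_mm):
        return A * (10 ** (a * x_mm) - k)

    def freq_to_place(f_hz):
        return math.log10(f_hz / A + k) / a

    def shift_toward_base(f_hz, shift_mm):
        """Frequency at the place `shift_mm` basal of the place that
        normally encodes f_hz: the kind of 0/2/4/6 mm upward spectral
        shift simulated in the study above."""
        return place_to_freq(freq_to_place(f_hz) + shift_mm)

    print(round(shift_toward_base(1000, 6)))  # a 6 mm shift: 1 kHz -> ~2.5 kHz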
@article {pmid33792205, year = {2021}, author = {Chen, F and Zhang, H and Ding, H and Wang, S and Peng, G and Zhang, Y}, title = {Neural coding of formant-exaggerated speech and nonspeech in children with and without autism spectrum disorders.}, journal = {Autism research : official journal of the International Society for Autism Research}, volume = {14}, number = {7}, pages = {1357-1374}, doi = {10.1002/aur.2509}, pmid = {33792205}, issn = {1939-3806}, mesh = {*Autism Spectrum Disorder/complications ; Child ; Child, Preschool ; Evoked Potentials ; Humans ; Language Development ; Phonetics ; Speech ; *Speech Perception ; }, abstract = {The presence of vowel exaggeration in infant-directed speech (IDS) may adapt to the age-appropriate demands in speech and language acquisition. Previous studies have provided behavioral evidence of atypical auditory processing towards IDS in children with autism spectrum disorders (ASD), while the underlying neurophysiological mechanisms remain unknown. This event-related potential (ERP) study investigated the neural coding of formant-exaggerated speech and nonspeech in 24 4- to 11-year-old children with ASD and 24 typically-developing (TD) peers. The EEG data were recorded using an alternating block design, in which each stimulus type (exaggerated/non-exaggerated sound) was presented with equal probability. ERP waveform analysis revealed an enhanced P1 for vowel formant exaggeration in the TD group but not in the ASD group. This speech-specific atypical processing in ASD was not found for the nonspeech stimuli which showed similar P1 enhancement in both ASD and TD groups. Moreover, the time-frequency analysis indicated that children with ASD showed differences in neural synchronization in the delta-theta bands for processing acoustic formant changes embedded in nonspeech. Collectively, the results add substantiating neurophysiological evidence (i.e., a lack of neural enhancement effect of vowel exaggeration) for atypical auditory processing of IDS in children with ASD, which may exert a negative effect on phonetic encoding and language learning. LAY SUMMARY: Atypical responses to motherese might act as a potential early marker of risk for children with ASD. This study investigated the neural responses to such socially relevant stimuli in the ASD brain, and the results suggested a lack of neural enhancement responding to the motherese even in individuals without intellectual disability.}, }
@article {pmid33786072, year = {2021}, author = {Carmona-Duarte, C and Ferrer, MA and Plamondon, R and Gómez-Rodellar, A and Gómez-Vilda, P}, title = {Sigma-Lognormal Modeling of Speech.}, journal = {Cognitive computation}, volume = {13}, number = {2}, pages = {488-503}, pmid = {33786072}, issn = {1866-9956}, abstract = {Human movement studies and analyses have been fundamental in many scientific domains, ranging from neuroscience to education, pattern recognition to robotics, health care to sports, and beyond. Previous speech motor models were proposed to understand how speech movement is produced and how the resulting speech varies when some parameters are changed. However, the inverse approach, in which the muscular response parameters and the subject's age are derived from real continuous speech, is not possible with such models. Instead, in the handwriting field, the kinematic theory of rapid human movements and its associated Sigma-lognormal model have been applied successfully to obtain the muscular response parameters. This work presents a speech kinematics-based model that can be used to study, analyze, and reconstruct complex speech kinematics in a simplified manner. A method based on the kinematic theory of rapid human movements and its associated Sigma-lognormal model is applied to describe and to parameterize the asymptotic impulse response of the neuromuscular networks involved in speech as a response to a neuromotor command. The method used to carry out transformations from formants to a movement observation is also presented. Experiments carried out with the (English) VTR-TIMIT database and the (German) Saarbrücken Voice Database, including people of different ages, with and without laryngeal pathologies, corroborate the link between the extracted parameters and aging, on the one hand, and the proportion between the first and second formants required in applying the kinematic theory of rapid human movements, on the other. The results should drive innovative developments in the modeling and understanding of speech kinematics.}, }
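For readers unfamiliar with the Sigma-lognormal model referenced in the preceding entry: each neuromuscular impulse response is modeled as a lognormal velocity profile, and a movement is the sum of several such profiles (hence "Sigma"). The sketch below implements one lognormal component; the parameter values are arbitrary placeholders, not estimates from the paper.

    import numpy as np

    def lognormal_velocity(t, D=1.0, t0=0.0, mu=-1.5, sigma=0.3):
        """One Sigma-lognormal component: velocity magnitude of the
        neuromuscular response to a command issued at time t0, with
        amplitude D and lognormal time parameters mu and sigma."""
        t = np.asarray(t, float)
        v = np.zeros_like(t)
        ok = t > t0
        dt = t[ok] - t0
        v[ok] = (D / (sigma * np.sqrt(2 * np.pi) * dt)
                 * np.exp(-(np.log(dt) - mu) ** 2 / (2 * sigma ** 2)))
        return v

    # A full trajectory would superimpose several components with
    # different D, t0, mu, sigma values.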
@article {pmid33775469, year = {2021}, author = {Oren, L and Rollins, M and Gutmark, E and Howell, R}, title = {How Face Masks Affect Acoustic and Auditory Perceptual Characteristics of the Singing Voice.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {}, number = {}, pages = {}, doi = {10.1016/j.jvoice.2021.02.028}, pmid = {33775469}, issn = {1873-4588}, abstract = {Wearing a face mask has been accepted as one of the most effective ways for slowing the spread of COVID-19. Yet information regarding the degree to which masks affect acoustics and perception associated with voice performers is scarce. This study examines these effects with common face masks, namely a neck gaiter, disposable surgical mask, and N95 mask, as well as a novel material that could be used as a mask (acoustic foam). A recorded excerpt from the "Star-Spangled Banner" was played through a miniature speaker placed inside the mouth of a masked manikin. Experienced listeners were asked to rate perceptual qualities of these singing stimuli by blindly comparing them with the same recording captured without a mask. Acoustic analysis showed that face masks affected the sound by enhancing or suppressing different frequency bands compared to no mask. Acoustic energy around the singer's formant was reduced when using surgical and N95 masks, which matches observations that these masks are more detrimental to the perception of the singing voice compared with the neck gaiter or acoustic foam. This suggests that singers can benefit from masks designed for minimal impact on auditory perception of the singing voice while maintaining reasonable filtering efficiency.}, }
@article {pmid33773895, year = {2023}, author = {Havel, M and Sundberg, J and Traser, L and Burdumy, M and Echternach, M}, title = {Effects of Nasalization on Vocal Tract Response Curve.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {3}, pages = {339-347}, doi = {10.1016/j.jvoice.2021.02.013}, pmid = {33773895}, issn = {1873-4588}, mesh = {Humans ; *Nose/physiology ; *Paranasal Sinuses/physiology ; Vibration ; Magnetic Resonance Imaging ; Models, Biological ; Speech Acoustics ; }, abstract = {BACKGROUND: Earlier studies have shown that nasalization affects the radiated spectrum by modifying the vocal tract transfer function in a complex manner.
METHODS: Here we study this phenomenon by measuring the sine-sweep responses of 3-D models of the vowels /u, a, æ, i/, derived from volumetric MR imaging, coupled by means of tubes of different lengths and diameters to a 3-D model of a nasal tract.
RESULTS: The coupling introduced a dip into the vocal tract transfer function. The dip frequency was close to the main resonance of the nasal tract, in agreement with the in vivo sweep-tone measurements of Fujimura & Lindqvist (1972). With increasing size of the coupling tube, the depth of the dip increased and the first formant peak either changed in frequency or was split by the dip. Only marginal effects of the paranasal sinuses were observed. For certain coupling tube sizes, the spectrum balance was changed, boosting the formant peaks in the 2-4 kHz range.
CONCLUSION: A velopharyngeal opening introduces a dip in the transfer function at the main resonance of the nasal tract. Its depth increases with the area of the opening and its frequency rises in some vowels.}, }
@article {pmid33769836, year = {2021}, author = {Coughler, C and Hamel, EM and Cardy, JO and Archibald, LMD and Purcell, DW}, title = {Compensation to Altered Auditory Feedback in Children With Developmental Language Disorder and Typical Development.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6S}, pages = {2363-2376}, doi = {10.1044/2020_JSLHR-20-00374}, pmid = {33769836}, issn = {1558-9102}, mesh = {Child ; Feedback ; Humans ; *Language Development Disorders ; Speech ; *Speech Perception ; Speech Production Measurement ; }, abstract = {Purpose Developmental language disorder (DLD), an unexplained problem using and understanding spoken language, has been hypothesized to have an underlying auditory processing component. Auditory feedback plays a key role in speech motor control. The current study examined whether auditory feedback is used to regulate speech production in a similar way by children with DLD and their typically developing (TD) peers. Method Participants aged 6-11 years completed tasks measuring hearing, language, first formant (F1) discrimination thresholds, partial vowel space, and responses to altered auditory feedback with F1 perturbation. Results Children with DLD tended to compensate more than TD children for the positive F1 manipulation and compensated less than TD children in the negative shift condition. Conclusion Our findings suggest that children with DLD make atypical use of auditory feedback.}, }
@article {pmid33758251, year = {2021}, author = {Arenillas-Alcón, S and Costa-Faidella, J and Ribas-Prats, T and Gómez-Roig, MD and Escera, C}, title = {Neural encoding of voice pitch and formant structure at birth as revealed by frequency-following responses.}, journal = {Scientific reports}, volume = {11}, number = {1}, pages = {6660}, pmid = {33758251}, issn = {2045-2322}, mesh = {*Acoustic Stimulation ; Adult ; Age Factors ; Biomarkers ; Brain/*physiology ; Cognition ; Humans ; Infant, Newborn ; Pediatrics ; *Pitch Perception ; Sound Spectrography ; Speech Perception ; *Voice ; }, abstract = {Detailed neural encoding of voice pitch and formant structure plays a crucial role in speech perception, and is of key importance for an appropriate acquisition of the phonetic repertoire in infants since birth. However, the extent to which newborns are capable of extracting pitch and formant structure information from the temporal envelope and the temporal fine structure of speech sounds, respectively, remains unclear. Here, we recorded the frequency-following response (FFR) elicited by a novel two-vowel, rising-pitch-ending stimulus to simultaneously characterize voice pitch and formant structure encoding accuracy in a sample of neonates and adults. Data revealed that newborns tracked changes in voice pitch reliably and no differently than adults, but exhibited weaker signatures of formant structure encoding, particularly at higher formant frequency ranges. Thus, our results indicate a well-developed encoding of voice pitch at birth, while formant structure representation is maturing in a frequency-dependent manner. Furthermore, we demonstrate the feasibility of assessing voice pitch and formant structure encoding within clinical evaluation times in a hospital setting, and suggest the possibility of using this novel stimulus as a tool for longitudinal developmental studies of the auditory system.}, }
@article {pmid33741872, year = {2021}, author = {Emrani, E and Ghaemi, H and Labafchi, A and Samieirad, S}, title = {The Effect of Bimaxillary Orthognathic Surgery on Voice Characteristics in Skeletal Class 3 Deformity Patients: An Evaluation Using Acoustic Analysis.}, journal = {The Journal of craniofacial surgery}, volume = {32}, number = {6}, pages = {2129-2133}, doi = {10.1097/SCS.0000000000007479}, pmid = {33741872}, issn = {1536-3732}, mesh = {Acoustics ; Adult ; Cephalometry ; Female ; Follow-Up Studies ; Humans ; Male ; *Malocclusion, Angle Class III/surgery ; Mandible ; Maxilla ; *Orthognathic Surgery ; *Orthognathic Surgical Procedures ; Osteotomy, Le Fort ; Osteotomy, Sagittal Split Ramus ; }, abstract = {The aim of this study was to analyze the effects of bimaxillary orthognathic surgery on the acoustic voice characteristics of skeletal class 3 patients. All healthy nonsyndromic patients with Class 3 deformity who were eligible for bimaxillary orthognathic surgery were included in this before-and-after quasi-experimental study. This experiment's main intervention was mandibular setback surgery by bilateral sagittal split osteotomy plus maxillary advancement using LeFort 1 osteotomy. Age, sex, and intraoperative jaw movements were recorded. Acoustic analysis of voice samples (vowels /a/ and /i/) was performed with Praat software to obtain the outcome variables. The fundamental frequency (F0) and formant frequencies (F1, F2, and F3) of these vowels were extracted 1 week preoperatively (T0) and 1 and 6 months postoperatively (T1, T2) by a speech therapist. The significance level was set at 0.05 using SPSS 19. The study sample comprised 20 patients including 11 women (55%) and 9 men (45%) with a mean age of 31.95 ± 4.72 years. The average mandibular setback and maxillary advancement were 3.30 ± 0.86 and 2.85 ± 0.74 mm, respectively. The fundamental frequency (F0) and the first, second, and third formants (F1, F2, F3) of vowels /i/ and /a/ were significantly decreased over time intervals, postoperatively (P < 0.05). The findings revealed that bimaxillary orthognathic surgery (maxillary advancement and mandibular setback with bilateral sagittal split osteotomy) might reduce the acoustic formant parameters of voice to the normal frequency ranges in patients with class 3 skeletal deformities. More clinical trials with greater sample sizes and long-term follow-ups are suggested in the future.}, }
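Praat-based F0 and formant extraction, as used in the preceding entry, can be scripted from Python through the praat-parselmouth package. This is a generic sketch with default analysis settings and a hypothetical file name, not the authors' protocol.

    import numpy as np
    import parselmouth  # pip install praat-parselmouth

    snd = parselmouth.Sound("vowel_a.wav")   # hypothetical sustained /a/

    pitch = snd.to_pitch()
    f0 = pitch.selected_array['frequency']   # 0 where unvoiced
    print("mean F0 (Hz):", np.mean(f0[f0 > 0]))

    formants = snd.to_formant_burg(maximum_formant=5500)
    t_mid = snd.duration / 2
    for n in (1, 2, 3):
        print(f"F{n} at midpoint (Hz):", formants.get_value_at_time(n, t_mid))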
@article {pmid33740875, year = {2022}, author = {Geng, P and Gu, W}, title = {Acoustic and Perceptual Characteristics of Mandarin Speech in Gay and Heterosexual Male Speakers.}, journal = {Language and speech}, volume = {65}, number = {4}, pages = {1096-1109}, doi = {10.1177/00238309211000783}, pmid = {33740875}, issn = {1756-6053}, mesh = {Male ; Humans ; Speech Acoustics ; Speech ; Heterosexuality ; Acoustics ; *Speech Perception ; *Sexual and Gender Minorities ; }, abstract = {This study investigated acoustic and perceptual characteristics of Mandarin speech produced by gay and heterosexual male speakers. Acoustic analysis of monosyllabic words showed significant differences between the two groups in voice fundamental frequency (F0), F1 of low vowel, and duration of aspiration/frication in consonants. The acoustic patterns on F0, formants, and center of gravity as well as spectral skewness of /s/ differed from those reported for Western languages like American English, which could be interpreted from a sociopsychological point of view based on different acceptability of gay identity in the two societies. The results of a perceptual experiment revealed significant but weak correlations between the acoustic parameters and the score of perceived gayness, which was significantly higher on gay speech than on heterosexual male speech. Although the observed F0 and F1 patterns in Mandarin gay speech were opposite to the stereotype of gayness, gay identity can still be identified to some extent from speech due to the existence of other acoustic cues such as a longer fricative duration, which is not a stereotype of gayness but has been consistently observed in Mandarin and Western languages.}, }
@article {pmid33739930, year = {2021}, author = {König, A and Riviere, K and Linz, N and Lindsay, H and Elbaum, J and Fabre, R and Derreumaux, A and Robert, P}, title = {Measuring Stress in Health Professionals Over the Phone Using Automatic Speech Analysis During the COVID-19 Pandemic: Observational Pilot Study.}, journal = {Journal of medical Internet research}, volume = {23}, number = {4}, pages = {e24191}, pmid = {33739930}, issn = {1438-8871}, mesh = {Adult ; Anxiety/*diagnosis/etiology/psychology ; Burnout, Professional/*diagnosis/etiology/psychology ; COVID-19/epidemiology/*psychology ; Female ; Health Personnel/*psychology ; Humans ; Male ; Pandemics ; Pilot Projects ; SARS-CoV-2 ; Speech/*physiology ; *Speech Acoustics ; Surveys and Questionnaires ; Telephone ; }, abstract = {BACKGROUND: During the COVID-19 pandemic, health professionals have been directly confronted with the suffering of patients and their families. By making them main actors in the management of this health crisis, they have been exposed to various psychosocial risks (stress, trauma, fatigue, etc). Paradoxically, stress-related symptoms are often underreported in this vulnerable population but are potentially detectable through passive monitoring of changes in speech behavior.
OBJECTIVE: This study aims to investigate the use of rapid and remote measures of stress levels in health professionals working during the COVID-19 outbreak. This was done through the analysis of participants' speech behavior during a short phone call conversation and, in particular, via positive, negative, and neutral storytelling tasks.
METHODS: Speech samples from 89 health care professionals were collected over the phone during positive, negative, and neutral storytelling tasks; various voice features were extracted and compared with classical stress measures via standard questionnaires. Additionally, a regression analysis was performed.
RESULTS: Certain speech characteristics correlated with stress levels in both genders; mainly, spectral (ie, formant) features, such as the mel-frequency cepstral coefficient, and prosodic characteristics, such as the fundamental frequency, appeared to be sensitive to stress. Overall, for both male and female participants, using vocal features from the positive tasks for regression yielded the most accurate prediction results of stress scores (mean absolute error 5.31).
CONCLUSIONS: Automatic speech analysis could help with early detection of subtle signs of stress in vulnerable populations over the phone. By combining the use of this technology with timely intervention strategies, it could contribute to the prevention of burnout and the development of comorbidities, such as depression or anxiety.}, }
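The voice features named in the preceding entry (mel-frequency cepstral coefficients and fundamental frequency) are standard to extract. The sketch below uses librosa with generic settings and a hypothetical recording, purely to show the feature family, not the authors' pipeline.

    import numpy as np
    import librosa

    y, sr = librosa.load("call.wav", sr=16000)       # hypothetical phone call

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)   # shape (13, frames)
    f0, voiced_flag, voiced_prob = librosa.pyin(
        y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C6'), sr=sr)

    # Per-recording summary vector: mean MFCCs plus mean voiced F0.
    features = np.concatenate([mfcc.mean(axis=1), [np.nanmean(f0)]])
    print(features.shape)  # (14,)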
@article {pmid33733165, year = {2020}, author = {Strycharczuk, P and López-Ibáñez, M and Brown, G and Leemann, A}, title = {General Northern English. Exploring Regional Variation in the North of England With Machine Learning.}, journal = {Frontiers in artificial intelligence}, volume = {3}, number = {}, pages = {48}, pmid = {33733165}, issn = {2624-8212}, abstract = {In this paper, we present a novel computational approach to the analysis of accent variation. The case study is dialect leveling in the North of England, manifested as reduction of accent variation across the North and emergence of General Northern English (GNE), a pan-regional standard accent associated with middle-class speakers. We investigated this instance of dialect leveling using random forest classification, with audio data from a crowd-sourced corpus of 105 urban, mostly highly-educated speakers from five northern UK cities: Leeds, Liverpool, Manchester, Newcastle upon Tyne, and Sheffield. We trained random forest models to identify individual northern cities from a sample of other northern accents, based on first two formant measurements of full vowel systems. We tested the models using unseen data. We relied on undersampling, bagging (bootstrap aggregation) and leave-one-out cross-validation to address some challenges associated with the data set, such as unbalanced data and relatively small sample size. The accuracy of classification provides us with a measure of relative similarity between different pairs of cities, while calculating conditional feature importance allows us to identify which input features (which vowels and which formants) have the largest influence in the prediction. We do find a considerable degree of leveling, especially between Manchester, Leeds and Sheffield, although some differences persist. The features that contribute to these differences most systematically are typically not the ones discussed in previous dialect descriptions. We propose that the most systematic regional features are also not salient, and as such, they serve as sociolinguistic regional indicators. We supplement the random forest results with a more traditional variationist description of by-city vowel systems, and we use both sources of evidence to inform a description of the vowels of General Northern English.}, }
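A minimal version of the classification set-up in the preceding entry: a random forest predicting a speaker's city from the first two formants of the full vowel system, evaluated with leave-one-out cross-validation. The data shapes and values below are placeholders, sklearn's class_weight option stands in for the authors' undersampling and bagging steps, and impurity-based importances stand in for their conditional feature importance.

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import LeaveOneOut, cross_val_score

    rng = np.random.default_rng(0)
    n_speakers, n_vowels = 105, 11                   # placeholder dimensions
    X = rng.normal(size=(n_speakers, 2 * n_vowels))  # F1, F2 per vowel
    y = rng.integers(0, 5, size=n_speakers)          # 5 northern cities

    clf = RandomForestClassifier(n_estimators=200, class_weight='balanced',
                                 random_state=0)
    acc = cross_val_score(clf, X, y, cv=LeaveOneOut()).mean()
    print("leave-one-out accuracy:", acc)

    clf.fit(X, y)
    print("most informative feature index:", np.argmax(clf.feature_importances_))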
@article {pmid33705674, year = {2021}, author = {Niziolek, CA and Parrell, B}, title = {Responses to Auditory Feedback Manipulations in Speech May Be Affected by Previous Exposure to Auditory Errors.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6S}, pages = {2169-2181}, pmid = {33705674}, issn = {1558-9102}, support = {R00 DC014520/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; }, mesh = {Feedback ; Feedback, Sensory ; Humans ; Reproducibility of Results ; *Speech ; *Speech Perception ; }, abstract = {Purpose Speakers use auditory feedback to guide their speech output, although individuals differ in the magnitude of their compensatory response to perceived errors in feedback. Little is known about the factors that contribute to the compensatory response or how fixed or flexible they are within an individual. Here, we test whether manipulating the perceived reliability of auditory feedback modulates speakers' compensation to auditory perturbations, as predicted by optimal models of sensorimotor control. Method Forty participants produced monosyllabic words in two separate sessions, which differed in the auditory feedback given during an initial exposure phase. In the veridical session exposure phase, feedback was normal. In the noisy session exposure phase, small, random formant perturbations were applied, reducing reliability of auditory feedback. In each session, a subsequent test phase introduced larger unpredictable formant perturbations. We assessed whether the magnitude of within-trial compensation for these larger perturbations differed across the two sessions. Results Compensatory responses to downward (though not upward) formant perturbations were larger in the veridical session than the noisy session. However, in post hoc testing, we found the magnitude of this effect is highly dependent on the choice of analysis procedures. Compensation magnitude was not predicted by other production measures, such as formant variability, and was not reliably correlated across sessions. Conclusions Our results, though mixed, provide tentative support that the feedback control system monitors the reliability of sensory feedback. These results must be interpreted cautiously given the potentially limited stability of auditory feedback compensation measures across analysis choices and across sessions. Supplemental Material https://doi.org/10.23641/asha.14167136.}, }
@article {pmid33705004, year = {2021}, author = {Hernández-García, E and Velazquez, LM and González, R and Godino Llorente, JI and Plaza, G}, title = {Influence of Upper Airway Surgery on Voice and Speech Recognition.}, journal = {The Journal of craniofacial surgery}, volume = {32}, number = {2}, pages = {660-663}, doi = {10.1097/SCS.0000000000007175}, pmid = {33705004}, issn = {1536-3732}, mesh = {Humans ; Prospective Studies ; Speech ; Speech Acoustics ; *Speech Perception ; *Voice ; Voice Quality ; }, abstract = {PURPOSE: Upper airway surgery comprises a set of techniques that modify the anatomy of the vocal tract, including tonsillectomy and septoplasty. The objective of this work is to study the changes in acoustic parameters and the effects on the identification or verification of the speaker through the speech produced after the vocal tract surgeries, comparing them with a control group.
METHODS: A prospective study was performed between January 2019 and June 2019. The final study sample consisted of 84 patients who met the inclusion criteria: 31 underwent septoplasty, 26 underwent tonsillectomy, and 27 were controls. Demographic data and GRBAS evaluation were statistically evaluated. Tests were taken before surgery, 2 weeks after surgery, and 3 months later. Furthermore, to establish the equal error rate, patients' voices were recorded and then subjected to acoustic analysis and automatic speaker identification using machine learning systems.
RESULTS: A significant difference in GRBAS was observed after surgery. Regarding acoustic parameters, a greater change was observed in the fundamental frequency 2 weeks after surgery in the tonsillectomy group. Formants (F1-F3) and antiformants (AntiF1-AntiF3) changed in the septoplasty group, but not in the tonsillectomy or control groups, at 3 months. When studying the impact of voice changes on speaker verification through speech, a greater recognition error was observed in the tonsillectomy group at 2 weeks, coinciding with the results obtained for the rest of the parameters studied.
CONCLUSIONS: Results suggest that upper airway surgery modifies the vocal tract, affecting GRBAS ratings and acoustic parameters, including formants and antiformants, and thereby has an effect on speaker verification through speech.
@article {pmid33679344, year = {2021}, author = {Riedinger, M and Nagels, A and Werth, A and Scharinger, M}, title = {Asymmetries in Accessing Vowel Representations Are Driven by Phonological and Acoustic Properties: Neural and Behavioral Evidence From Natural German Minimal Pairs.}, journal = {Frontiers in human neuroscience}, volume = {15}, number = {}, pages = {612345}, pmid = {33679344}, issn = {1662-5161}, abstract = {In vowel discrimination, commonly found discrimination patterns are directional asymmetries where discrimination is faster (or easier) if differing vowels are presented in a certain sequence compared to the reversed sequence. Different models of speech sound processing try to account for these asymmetries based on either phonetic or phonological properties. In this study, we tested and compared two of those often-discussed models, namely the Featurally Underspecified Lexicon (FUL) model (Lahiri and Reetz, 2002) and the Natural Referent Vowel (NRV) framework (Polka and Bohn, 2011). While most studies presented isolated vowels, we investigated a large stimulus set of German vowels in a more naturalistic setting within minimal pairs. We conducted a mismatch negativity (MMN) study in a passive oddball paradigm and a reaction time study in an active oddball paradigm. In both data sets, we found directional asymmetries that can be explained by either phonological or phonetic theories. While, behaviorally, vowel discrimination was based on phonological properties, both tested models failed to explain the observed neural patterns comprehensively. Therefore, we additionally examined in depth, via multiple regression analyses, the influence of a variety of articulatory, acoustical, and lexical factors (e.g., formant structure, intensity, duration, and frequency of occurrence), as well as factors beyond the well-known ones (perceived loudness of vowels, degree of openness). The analyses revealed that the perceptual factor of perceived loudness has a greater impact than previously considered in the literature and should be taken more strongly into consideration when analyzing preattentive natural vowel processing.}, }
@article {pmid33675539, year = {2021}, author = {Kim, KS and Max, L}, title = {Speech auditory-motor adaptation to formant-shifted feedback lacks an explicit component: Reduced adaptation in adults who stutter reflects limitations in implicit sensorimotor learning.}, journal = {The European journal of neuroscience}, volume = {53}, number = {9}, pages = {3093-3108}, pmid = {33675539}, issn = {1460-9568}, support = {R01 DC017444/DC/NIDCD NIH HHS/United States ; R01 DC014510/DC/NIDCD NIH HHS/United States ; MOP-137001//CIHR/Canada ; }, mesh = {Adaptation, Physiological ; Adult ; Child ; Feedback ; Feedback, Sensory ; Humans ; Learning ; *Speech ; *Stuttering ; }, abstract = {The neural mechanisms underlying stuttering remain poorly understood. A large body of work has focused on sensorimotor integration difficulties in individuals who stutter, including recently the capacity for sensorimotor learning. Typically, sensorimotor learning is assessed with adaptation paradigms in which one or more sensory feedback modalities are experimentally perturbed in real time. Our own previous work on speech with perturbed auditory feedback revealed substantial auditory-motor learning limitations in both children and adults who stutter (AWS). It remains unknown, however, which subprocesses of sensorimotor learning are impaired. Indeed, new insights from research on upper limb motor control indicate that sensorimotor learning involves at least two distinct components: (a) an explicit component that includes intentional strategy use and presumably is driven by target error and (b) an implicit component that updates an internal model without awareness of the learner and presumably is driven by sensory prediction error. Here, we attempted to dissociate these components for speech auditory-motor learning in AWS versus adults who do not stutter (AWNS). Our formant-shift auditory-motor adaptation results replicated previous findings that such sensorimotor learning is limited in AWS. Novel findings are that neither control nor stuttering participants reported any awareness of changing their productions in response to the auditory perturbation and that neither group showed systematic drift in auditory target judgments made throughout the adaptation task. These results indicate that speech auditory-motor adaptation to formant-shifted feedback relies exclusively on implicit learning processes. Thus, limited adaptation in AWS reflects poor implicit sensorimotor learning.}, }
@article {pmid33658966, year = {2021}, author = {Stefanich, S and Cabrelli, J}, title = {The Effects of L1 English Constraints on the Acquisition of the L2 Spanish Alveopalatal Nasal.}, journal = {Frontiers in psychology}, volume = {12}, number = {}, pages = {640354}, pmid = {33658966}, issn = {1664-1078}, abstract = {This study examines whether L1 English/L2 Spanish learners at different proficiency levels acquire a novel L2 phoneme, the Spanish palatal nasal /ɲ/. While alveolar /n/ is part of the Spanish and English inventories, /ɲ/, which consists of a tautosyllabic palatal nasal+glide element, is not. This crosslinguistic disparity presents potential difficulty for L1 English speakers due to L1 segmental and phonotactic constraints; the closest English approximation is the heterosyllabic sequence /nj/ (e.g., "canyon" /kænjən/ ['kʰæn.jən], cf. Spanish cañón "canyon" /kaɲon/ [ka.'ɲon]). With these crosslinguistic differences in mind, we ask: (1a) Do L1 English learners of L2 Spanish produce acoustically distinct Spanish /n/ and /ɲ/ and (1b) Does the distinction of /n/ and /ɲ/ vary by proficiency? In the case that learners distinguish /n/ and /ɲ/, the second question investigates the acoustic quality of /ɲ/ to determine (2a) if learners' L2 representation patterns with that of an L1 Spanish representation or if learners rely on an L1 representation (here, English /nj/) and (2b) if the acoustic quality of L2 Spanish /ɲ/ varies as a function of proficiency. Beginner (n = 9) and advanced (n = 8) L1 English/L2 Spanish speakers and a comparison group of 10 L1 Spanish/L2 English speakers completed delayed repetition tasks in which disyllabic nonce words were produced in a carrier phrase. English critical items contained an intervocalic heterosyllabic /nj/ sequence (e.g., ['pʰan.jə]); Spanish critical items consisted of items with either intervocalic onset /ɲ/ (e.g., ['xa.ɲa]) or /n/ ['xa.na]. We measured duration and formant contours of the following vocalic portion as acoustic indices of the /n/~/ɲ/ and /ɲ/~/nj/ distinctions. Results show that, while L2 Spanish learners produce an acoustically distinct /n/~/ɲ/ contrast even at a low level of proficiency, the beginners produce an intermediate /ɲ/ that falls acoustically between their English /nj/ and the L1 Spanish /ɲ/ while the advanced learners' Spanish /ɲ/ and English /nj/ appear to be in the process of equivalence classification. We discuss these outcomes as they relate to the robustness of L1 phonological constraints in late L2 acquisition coupled with the role of perceptual cues, functional load, and questions of intelligibility.}, }
@article {pmid33657098, year = {2021}, author = {Tabas, A and von Kriegstein, K}, title = {Neural modelling of the encoding of fast frequency modulation.}, journal = {PLoS computational biology}, volume = {17}, number = {3}, pages = {e1008787}, pmid = {33657098}, issn = {1553-7358}, mesh = {Adult ; Auditory Cortex/*physiology ; Auditory Pathways/*physiology ; Computational Biology ; Female ; Humans ; Male ; *Models, Neurological ; Speech Perception/*physiology ; Young Adult ; }, abstract = {Frequency modulation (FM) is a basic constituent of vocalisation in many animals as well as in humans. In human speech, short rising and falling FM-sweeps of around 50 ms duration, called formant transitions, characterise individual speech sounds. There are two representations of FM in the ascending auditory pathway: a spectral representation, holding the instantaneous frequency of the stimuli; and a sweep representation, consisting of neurons that respond selectively to FM direction. To date, computational models use feedforward mechanisms to explain FM encoding. However, from neuroanatomy we know that there are massive feedback projections in the auditory pathway. Here, we found that a classical FM-sweep perceptual effect, the sweep pitch shift, cannot be explained by standard feedforward processing models. We hypothesised that the sweep pitch shift is caused by a predictive feedback mechanism. To test this hypothesis, we developed a novel model of FM encoding incorporating a predictive interaction between the sweep and the spectral representation. The model was designed to encode sweeps of the duration, modulation rate, and modulation shape of formant transitions. It fully accounted for experimental data that we acquired in a perceptual experiment with human participants as well as previously published experimental results. We also designed a new class of stimuli for a second perceptual experiment to further validate the model. Combined, our results indicate that predictive interaction between the frequency encoding and direction encoding neural representations plays an important role in the neural processing of FM. In the brain, this mechanism is likely to occur at early stages of the processing hierarchy.}, }
@article {pmid33656916, year = {2021}, author = {Levy, ES and Chang, YM and Hwang, K and McAuliffe, MJ}, title = {Perceptual and Acoustic Effects of Dual-Focus Speech Treatment in Children With Dysarthria.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {6S}, pages = {2301-2316}, doi = {10.1044/2020_JSLHR-20-00301}, pmid = {33656916}, issn = {1558-9102}, mesh = {Acoustics ; Adult ; *Dysarthria/etiology/therapy ; Humans ; Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {Purpose Children with dysarthria secondary to cerebral palsy may experience reduced speech intelligibility and diminished communicative participation. However, minimal research has been conducted examining the outcomes of behavioral speech treatments in this population. This study examined the effect of Speech Intelligibility Treatment (SIT), a dual-focus speech treatment targeting increased articulatory excursion and vocal intensity, on intelligibility of narrative speech, speech acoustics, and communicative participation in children with dysarthria. Method American English-speaking children with dysarthria (n = 17) received SIT in a 3-week summer camp-like setting at Columbia University. SIT follows motor-learning principles to train the child-friendly, dual-focus strategy, "Speak with your big mouth and strong voice." Children produced a story narrative at baseline, immediate posttreatment (POST), and at 6-week follow-up (FUP). Outcomes were examined via blinded listener ratings of ease of understanding (n = 108 adult listeners), acoustic analyses, and questionnaires focused on communicative participation. Results SIT resulted in significant increases in ease of understanding at POST that were maintained at FUP. There were no significant changes to vocal intensity, speech rate, or vowel spectral characteristics, with the exception of an increase in second formant difference between vowels following SIT. Significantly enhanced communicative participation was evident at POST and FUP. Considerable variability in response to SIT was observed between children. Conclusions Dual-focus treatment shows promise for improving intelligibility and communicative participation in children with dysarthria, although responses to treatment vary considerably across children. Possible mechanisms underlying the intelligibility gains, enhanced communicative participation, and variability in treatment effects are discussed.}, }
@article {pmid33646815, year = {2021}, author = {Howson, PJ and Redford, MA}, title = {The Acquisition of Articulatory Timing for Liquids: Evidence From Child and Adult Speech.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {3}, pages = {734-753}, pmid = {33646815}, issn = {1558-9102}, support = {R01 HD087452/HD/NICHD NIH HHS/United States ; }, mesh = {Adolescent ; Adult ; Aged, 80 and over ; Child ; Child, Preschool ; Family ; Humans ; *Language ; Phonetics ; *Speech ; Speech Acoustics ; Speech Production Measurement ; Tongue ; Young Adult ; }, abstract = {Purpose Liquids are among the last sounds to be acquired by English-speaking children. The current study considers their acquisition from an articulatory timing perspective by investigating anticipatory posturing for /l/ versus /ɹ/ in child and adult speech. Method In Experiment 1, twelve 5-year-old, twelve 8-year-old, and 11 college-aged speakers produced carrier phrases with penultimate stress on monosyllabic words that had /l/, /ɹ/, or /d/ (control) as singleton onsets and /æ/ or /u/ as the vowel. Short-domain anticipatory effects were acoustically investigated based on schwa formant values extracted from the preceding determiner (= the) and dynamic formant values across the /ə#LV/ sequence. In Experiment 2, long-domain effects were perceptually indexed using a previously validated forward-gated audiovisual speech prediction task. Results Experiment 1 results indicated that all speakers distinguished /l/ from /ɹ/ along F3. Adults distinguished /l/ from /ɹ/ with a lower F2. Older children produced subtler versions of the adult pattern; their anticipatory posturing was also more influenced by the following vowel. Younger children did not distinguish /l/ from /ɹ/ along F2, but both liquids were distinguished from /d/ in the domains investigated. Experiment 2 results indicated that /ɹ/ was identified earlier than /l/ in gated adult speech; both liquids were identified equally early in 5-year-olds' speech. Conclusions The results are interpreted to suggest a pattern of early tongue-body retraction for liquids in /ə#LV/ sequences in children's speech. More generally, it is suggested that children must learn to inhibit the influence of vowels on liquid articulation to achieve an adultlike contrast between /l/ and /ɹ/ in running speech.}, }
@article {pmid33639824, year = {2021}, author = {Raharjo, I and Kothare, H and Nagarajan, SS and Houde, JF}, title = {Speech compensation responses and sensorimotor adaptation to formant feedback perturbations.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {2}, pages = {1147}, pmid = {33639824}, issn = {1520-8524}, support = {R01 DC013979/DC/NIDCD NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; R01 DC017690/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 NS100440/NS/NINDS NIH HHS/United States ; R01 DC010145/DC/NIDCD NIH HHS/United States ; }, mesh = {Feedback ; Feedback, Sensory ; Phonetics ; *Speech ; *Speech Perception ; }, abstract = {Control of speech formants is important for the production of distinguishable speech sounds and is achieved with both feedback and learned feedforward control. However, it is unclear whether the learning of feedforward control involves the mechanisms of feedback control. Speakers have been shown to compensate for unpredictable transient mid-utterance perturbations of pitch and loudness feedback, demonstrating online feedback control of these speech features. To determine whether similar feedback control mechanisms exist in the production of formants, responses to unpredictable vowel formant feedback perturbations were examined. Results showed similar within-trial compensatory responses to formant perturbations that were presented at utterance onset and mid-utterance. The relationship between online feedback compensation to unpredictable formant perturbations and sensorimotor adaptation to consistent formant perturbations was further examined. Within-trial online compensation responses were not correlated with across-trial sensorimotor adaptation. A detailed analysis of within-trial time course dynamics across trials during sensorimotor adaptation revealed that across-trial sensorimotor adaptation responses did not result from an incorporation of the within-trial compensation response. These findings suggest that online feedback compensation and sensorimotor adaptation are governed by distinct neural mechanisms. These findings have important implications for models of speech motor control in terms of how feedback and feedforward control mechanisms are implemented.}, }
@article {pmid33639809, year = {2021}, author = {Carignan, C}, title = {A practical method of estimating the time-varying degree of vowel nasalization from acoustic features.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {2}, pages = {911}, doi = {10.1121/10.0002925}, pmid = {33639809}, issn = {1520-8524}, mesh = {Acoustics ; Language ; *Phonetics ; *Speech Acoustics ; }, abstract = {This paper presents a simple and easy-to-use method of creating a time-varying signal of the degree of nasalization in vowels, generated from acoustic features measured in oral and nasalized vowel contexts. The method is presented for separate models constructed using two sets of acoustic features: (1) an uninformed set of 13 Mel-frequency cepstral coefficients (MFCCs) and (2) a combination of the 13 MFCCs and a phonetically informed set of 20 acoustic features of vowel nasality derived from previous research. Both models are compared against two traditional approaches to estimating vowel nasalization from acoustics: A1-P0 and A1-P1, as well as their formant-compensated counterparts. Data include productions from six speakers of different language backgrounds, producing 11 different qualities within the vowel quadrilateral. The results generated from each of the methods are compared against nasometric measurements, representing an objective "ground truth" of the degree of nasalization. The results suggest that the proposed method is more robust than conventional acoustic approaches, generating signals which correlate strongly with nasometric measures across all vowel qualities and all speakers and accurately approximate the time-varying change in the degree of nasalization. Finally, an experimental example is provided to help researchers implement the method in their own study designs.}, }
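The MFCC-based mapping described above lends itself to a compact implementation. The following is a minimal sketch, not the author's pipeline: it assumes librosa and scikit-learn, uses a plain linear regression rather than whatever model the paper trained, and the file names and per-frame nasalance array are hypothetical placeholders.

    import librosa
    import numpy as np
    from sklearn.linear_model import LinearRegression

    def mfcc_frames(path, sr=16000, n_mfcc=13):
        y, sr = librosa.load(path, sr=sr)
        # Transpose so rows are frames and columns are the 13 MFCCs.
        return librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc).T

    # Hypothetical training material: frames from oral and nasalized vowel tokens,
    # paired with simultaneous nasometric measurements as the "ground truth".
    X_train = np.vstack([mfcc_frames("oral_vowels.wav"), mfcc_frames("nasal_vowels.wav")])
    y_train = np.load("nasalance_per_frame.npy")  # one nasalance value per frame

    model = LinearRegression().fit(X_train, y_train)

    # Applying the model to a new token yields a time-varying nasalization signal.
    nasalization_track = model.predict(mfcc_frames("test_vowel.wav"))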
@article {pmid33630668, year = {2021}, author = {Chung, H and Weismer, G}, title = {Formant Trajectory Patterns of American English /l/ Produced by Adults and Children.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {3}, pages = {809-822}, doi = {10.1044/2020_JSLHR-20-00345}, pmid = {33630668}, issn = {1558-9102}, mesh = {Adult ; Child ; Child Language ; Child, Preschool ; Female ; Humans ; Language ; Language Development ; *Phonetics ; *Speech ; Speech Acoustics ; United States ; }, abstract = {Purpose Most acoustic and articulatory studies on /l/ have focused on either duration, formant frequencies, or tongue shape during the constriction interval. Only a limited set of data exists for the transition characteristics of /l/ to and from surrounding vowels. The aim of this study was to examine second formant (F2) transition characteristics of /l/ produced by young children and adults. This was to better understand articulatory behaviors in the production of /l/ and potential clinical applications of these data to typical and delayed /l/ development. Method Participants included 17 children with typically developing speech between the ages of 2 and 5 years, and 10 female adult speakers of Southern American English. Each subject produced single words containing pre- and postvocalic /l/ in two vowel contexts (/i, ɪ/ and /ɔ, ɑ/). F2 transitions, out of and into /l/ constriction intervals from the adjacent vowels, were analyzed for perceptually acceptable /l/ productions. The F2 transition extent, duration, and rate, as well as F2 loci data, were compared across age groups by vowel context for both pre- and postvocalic /l/. Results F2 transitions of adults' /l/ showed a great similarity across and within speakers. Those of young children showed greater variability, but became increasingly similar to those of adults with age. The F2 loci data seemed consistent with greater coarticulation among children than adults. This conclusion, however, must be regarded as preliminary due to the possible influence of different vocal tract size across ages and variability in the data. Conclusions The results suggest that adult patterns can serve as a reliable reference to which children's /l/ productions can be evaluated. The articulatory configurations associated with the /l/ constriction interval and the vocal tract movements into and out of that interval may provide insight into the underlying difficulties related to misarticulated /l/.}, }
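For readers who want to reproduce the transition measures named above, here is a minimal numpy sketch under stated assumptions: F2 values are taken as already measured at the transition onset and at the vowel target, the token values are invented for illustration, and the onset-on-target regression shown is the standard locus-equation fit rather than the paper's exact loci analysis.

    import numpy as np

    def f2_transition(f2_onset_hz, f2_target_hz, duration_s):
        extent = f2_target_hz - f2_onset_hz      # Hz
        rate = extent / duration_s               # Hz/s
        return extent, duration_s, rate

    # Locus-style regression: F2 at transition onset against F2 at the vowel
    # target across tokens; a slope near 1 suggests strong coarticulation,
    # a slope near 0 a fixed articulatory locus.
    f2_targets = np.array([2100.0, 1700.0, 2300.0, 1500.0])  # illustrative tokens
    f2_onsets = np.array([1820.0, 1560.0, 1950.0, 1430.0])
    slope, intercept = np.polyfit(f2_targets, f2_onsets, 1)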
@article {pmid33615923, year = {2021}, author = {Ng, ML and Woo, HK}, title = {Effect of total laryngectomy on vowel production: An acoustic study of vowels produced by alaryngeal speakers of Cantonese.}, journal = {International journal of speech-language pathology}, volume = {23}, number = {6}, pages = {652-661}, doi = {10.1080/17549507.2021.1876166}, pmid = {33615923}, issn = {1754-9515}, mesh = {Acoustics ; Humans ; Laryngectomy ; *Larynx, Artificial ; Phonetics ; Speech ; Speech Acoustics ; *Speech, Alaryngeal ; }, abstract = {Purpose: To investigate the effect of total laryngectomy on vowel production, the present study examined the change in vowel articulation associated with different types of alaryngeal speech in comparison with laryngeal speech using novel derived formant metrics. Method: Six metrics derived from the first two formants (F1 and F2), including the First and Second Formant Range Ratios (F1RR and F2RR), triangular and pentagonal Vowel Space Area (tVSA and pVSA), Formant Centralisation Ratio (FCR) and Average Vowel Spacing (AVS), were measured from vowels (/i, y, ɛ, a, ɔ, œ, u/) produced by oesophageal (ES), tracheoesophageal (TE), electrolaryngeal (EL), and pneumatic artificial laryngeal (PA) speakers, as well as laryngeal speakers. Result: Data revealed a general reduction in articulatory range and a tendency toward vowel centralisation in Cantonese alaryngeal speakers. Significant articulatory differences were found for PA and EL compared with ES, TE, and laryngeal speakers. Conclusion: The discrepant results among alaryngeal speakers may be related to the difference in the new sound source (external vs internal). Sensitivity and correlation analyses confirmed that the use of the matrix of derived formant metrics provided a more comprehensive profile of the articulatory pattern in the alaryngeal population.}, }
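Three of the six derived metrics can be sketched directly from corner-vowel formant means. In the sketch below, FCR follows the (F2u + F2a + F1i + F1u)/(F2i + F1a) definition from the dysarthria literature, tVSA is the shoelace area over /i, a, u/, and AVS is taken to be the mean pairwise Euclidean distance in the F1-F2 plane; whether these match the paper's exact Cantonese implementations is an assumption, and the formant values are invented.

    import itertools
    import numpy as np

    def tVSA(f):  # f maps vowel -> (F1, F2) means in Hz
        (a1, a2), (i1, i2), (u1, u2) = f["a"], f["i"], f["u"]
        return 0.5 * abs(a1 * (i2 - u2) + i1 * (u2 - a2) + u1 * (a2 - i2))

    def FCR(f):
        return (f["u"][1] + f["a"][1] + f["i"][0] + f["u"][0]) / (f["i"][1] + f["a"][0])

    def AVS(f):
        pairs = itertools.combinations(f.values(), 2)
        return float(np.mean([np.hypot(p[0] - q[0], p[1] - q[1]) for p, q in pairs]))

    formants = {"i": (300.0, 2300.0), "a": (800.0, 1300.0), "u": (320.0, 800.0)}
    print(tVSA(formants), FCR(formants), AVS(formants))

Vowel centralization pushes FCR up towards and above 1 while shrinking tVSA and AVS, which is why a matrix of such metrics can profile articulatory range.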
@article {pmid33608184, year = {2023}, author = {Maryn, Y and Wuyts, FL and Zarowski, A}, title = {Are Acoustic Markers of Voice and Speech Signals Affected by Nose-and-Mouth-Covering Respiratory Protective Masks?.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {3}, pages = {468.e1-468.e12}, pmid = {33608184}, issn = {1873-4588}, mesh = {Humans ; *Speech ; Masks ; Speech Acoustics ; *COVID-19 ; Acoustics ; Speech Production Measurement ; }, abstract = {BACKGROUND: Worldwide use of the nose-and-mouth-covering respiratory protective mask (RPM) has become ubiquitous during the COVID-19 pandemic. Consequences of wearing RPMs, especially regarding perception and production of spoken communication, are gradually emerging. The present study explored how three prevalent RPMs affect various speech and voice sound properties.
METHODS: Pre-recorded sustained [a] vowels and read sentences from 47 subjects were played by a speech production model ('Voice Emitted by Spare Parts', or 'VESPA') in four conditions: without RPM (C1), with disposable surgical mask (C2), with FFP2 mask (C3), and with transparent plastic mask (C4). Differences between C1 and masked conditions were assessed with Dunnett's t test in 26 speech sound properties related to voice production (fundamental frequency, sound intensity level), voice quality (jitter percent, shimmer percent, harmonics-to-noise ratio, smoothed cepstral peak prominence, Acoustic Voice Quality Index), articulation and resonance (first and second formant frequencies, first and second formant bandwidths, spectral center of gravity, spectral standard deviation, spectral skewness, spectral kurtosis, spectral slope, and spectral energy in ten 1-kHz bands from 0 to 10 kHz).
RESULTS: C2, C3, and C4 significantly affected 10, 15, and 19 of the acoustic speech markers, respectively. Furthermore, absolute differences between unmasked and masked conditions were largest for C4 and smallest for C2.
CONCLUSIONS: All RPMs influenced speech sound properties to some degree. However, this influence was smallest for the surgical RPM and largest for the plastic RPM. Surgical RPMs are therefore preferred when spoken communication is a priority alongside respiratory protection.}, }
@article {pmid33600430, year = {2021}, author = {Cavalcanti, JC and Eriksson, A and Barbosa, PA}, title = {Acoustic analysis of vowel formant frequencies in genetically-related and non-genetically related speakers with implications for forensic speaker comparison.}, journal = {PloS one}, volume = {16}, number = {2}, pages = {e0246645}, pmid = {33600430}, issn = {1932-6203}, mesh = {Acoustics ; Adult ; Brazil ; Forensic Sciences/methods ; Humans ; Language ; Male ; Phonetics ; Psychoacoustics ; Speech/*physiology ; *Speech Acoustics ; Speech Perception/physiology ; Twins, Monozygotic ; Verbal Behavior/*physiology ; }, abstract = {The purpose of this study was to explore the speaker-discriminatory potential of vowel formant mean frequencies in comparisons of identical twin pairs and non-genetically related speakers. The influences of lexical stress and the vowels' acoustic distances on the discriminatory patterns of formant frequencies were also assessed. Acoustic extraction and analysis of the first four speech formants F1-F4 were carried out using spontaneous speech materials. The recordings comprise telephone conversations between identical twin pairs that were simultaneously recorded directly through high-quality microphones. The subjects were 20 male adult speakers of Brazilian Portuguese (BP), aged between 19 and 35. As for comparisons, stressed and unstressed oral vowels of BP were segmented and transcribed manually in the Praat software. F1-F4 formant estimates were automatically extracted from the middle points of each labeled vowel. Formant values were represented in both Hertz and Bark. Comparisons within identical twin pairs using the Bark scale were performed to verify whether the measured differences would be potentially significant when following a psychoacoustic criterion. The results revealed consistent patterns regarding the comparison of low-frequency and high-frequency formants in twin pairs and non-genetically related speakers, with high-frequency formants displaying a greater speaker-discriminatory power compared to low-frequency formants. Among all formants, F4 seemed to display the highest discriminatory potential within identical twin pairs, followed by F3. As for non-genetically related speakers, both F3 and F4 displayed a similar high discriminatory potential. Regarding vowel quality, the central vowel /a/ was found to be the most speaker-discriminatory segment, followed by front vowels. Moreover, stressed vowels displayed higher inter-speaker discrimination than unstressed vowels in both groups; however, the combination of stressed and unstressed vowels was found to be even more explanatory of the observed differences. Although identical twins displayed a higher phonetic similarity, they were not found to be phonetically identical.}, }
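The Hz-to-Bark conversion behind the psychoacoustic comparison is compactly expressible. The sketch below uses Traunmüller's (1990) formula, which is a common choice but an assumption here, and treats a within-pair difference under 1 Bark as perceptually negligible purely for illustration.

    def hz_to_bark(f_hz):
        # Traunmueller (1990) approximation of the Bark scale.
        return 26.81 * f_hz / (1960.0 + f_hz) - 0.53

    def bark_difference(f_speaker_a_hz, f_speaker_b_hz):
        return abs(hz_to_bark(f_speaker_a_hz) - hz_to_bark(f_speaker_b_hz))

    # Example: F4 of /a/ measured at the vowel midpoint for each member of a twin pair.
    print(bark_difference(3550.0, 3700.0))  # about 0.26 Bark, under a 1-Bark criterion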
@article {pmid33589372, year = {2023}, author = {Lau, HYC and Scherer, RC}, title = {Objective Measures of Two Musical Interpretations of an Excerpt From Berlioz's "La mort d'Ophélie".}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {2}, pages = {301.e9-301.e25}, doi = {10.1016/j.jvoice.2020.12.045}, pmid = {33589372}, issn = {1873-4588}, mesh = {Humans ; Voice Quality ; *Music ; Speech Acoustics ; Glottis/physiology ; *Voice ; }, abstract = {OBJECTIVE/HYPOTHESIS: This study aimed to determine objective production differences relative to two emotional interpretations in performing an excerpt from a classical art song. The null hypothesis was proposed.
METHODS: The first author recorded an excerpt from an art song. The excerpt was sung with two contrasting musical interpretations: an "empathetic legato" approach, and a "sarcastic" approach characterized by emphatic attacks. Microphone, airflow, and electroglottography signals were digitized. The vowels were analyzed in terms of intensity, long-term average spectra, fundamental frequency (fo), airflow vibrato rate and extent, vowel onset slope, intensity comparison of harmonic frequencies, and glottal measures based on electroglottograph waveforms. Four consonant tokens were analyzed relative to airflow, voice onset time, and production duration.
RESULTS & CONCLUSIONS: The emphatic performance had faster vowel onset, increased glottal adduction, increased intensity of harmonics in the 2-3 kHz region, increased intensity in the fourth and fifth formants, inferred subglottal pressure increase, increased airflow for /f/, and greater aspiration airflow for /p, t/. Vibrato extents for intensity, fo, and airflow were wider in the emphatic approach. Findings revealed larger EGGW25 and peak-to-peak amplitude values of the electroglottography waveform, suggesting greater vocal fold contact area and longer glottal closure for the emphatic approach. Long-term average spectrum analyses of the entire production displayed minor variation across all formant frequencies, suggesting an insignificant change in vocal tract shaping between the two approaches. This single-case objective study emphasizes the reality of physiological, aerodynamic, and acoustic production differences in the interpretive and pedagogical aspects of art song performance.}, }
@article {pmid33577218, year = {2021}, author = {Easwar, V and Bridgwater, E and Purcell, D}, title = {The Influence of Vowel Identity, Vowel Production Variability, and Consonant Environment on Envelope Following Responses.}, journal = {Ear and hearing}, volume = {42}, number = {3}, pages = {662-672}, doi = {10.1097/AUD.0000000000000966}, pmid = {33577218}, issn = {1538-4667}, mesh = {Acoustic Stimulation ; Evoked Potentials, Auditory, Brain Stem ; Humans ; Language ; Phonetics ; Speech ; Speech Acoustics ; *Speech Perception ; Young Adult ; }, abstract = {OBJECTIVES: The vowel-evoked envelope following response (EFR) is a useful tool for studying brainstem processing of speech in natural consonant-vowel productions. Previous work, however, demonstrates that the amplitude of EFRs is highly variable across vowels. To clarify factors contributing to the variability observed, the objectives of the present study were to evaluate: (1) the influence of vowel identity and the consonant context surrounding each vowel on EFR amplitude and (2) the effect of variations in repeated productions of a vowel on EFR amplitude while controlling for the consonant context.
DESIGN: In Experiment 1, EFRs were recorded in response to seven English vowels (/ij/, /ɪ/, /ej/, /ε/, /æ/, /u/, and /ʊ/) embedded in each of four consonant contexts (/hVd/, /sVt/, /zVf/, and /ʃVv/). In Experiment 2, EFRs were recorded in response to four different variants of one of the four possible vowels (/ij/, /ε/, /æ/, or /ʊ/), embedded in the same consonant-vowel-consonant environments used in Experiment 1. All vowels were edited to minimize formant transitions before embedding in a consonant context. Different talkers were used for the two experiments. Data from a total of 30 and 64 (16 listeners/vowel) young adults with normal hearing were included in Experiments 1 and 2, respectively. EFRs were recorded using a single-channel electrode montage between the vertex and nape of the neck while stimuli were presented monaurally.
RESULTS: In Experiment 1, vowel identity had a significant effect on EFR amplitude with the vowel /æ/ eliciting the highest amplitude EFRs (170 nV, on average), and the vowel /ej/ eliciting the lowest amplitude EFRs (106 nV, on average). The consonant context surrounding each vowel stimulus had no statistically significant effect on EFR amplitude. Similarly, in Experiment 2, consonant context did not influence the amplitude of EFRs elicited by the vowel variants. Vowel identity significantly altered EFR amplitude with /ε/ eliciting the highest amplitude EFRs (104 nV, on average). Significant, albeit small, differences (<21 nV, on average) in EFR amplitude were evident between some variants of /ε/ and /ʊ/.
CONCLUSION: Based on a comprehensive set of naturally produced vowel samples in carefully controlled consonant contexts, the present study provides additional evidence for the sensitivity of EFRs to vowel identity and variations in vowel production. The surrounding consonant context (after removal of formant transitions) has no measurable effect on EFRs, irrespective of vowel identity and variant. The sensitivity of EFRs to nuances in vowel acoustics emphasizes the need for adequate control and evaluation of stimuli proposed for clinical and research purposes.}, }
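EFR amplitude in studies of this kind is conventionally estimated as the Fourier magnitude of the epoch-averaged EEG at the stimulus voicing rate. The numpy sketch below illustrates that idea with a synthetic 100-Hz "response"; it is not the authors' analysis pipeline (their f0 tracking, noise estimation, and statistics are omitted), and the sampling rate and amplitudes are invented.

    import numpy as np

    def efr_amplitude(epochs, fs_hz, f0_hz):
        avg = epochs.mean(axis=0)                          # average across epochs
        spec = np.abs(np.fft.rfft(avg)) / len(avg) * 2.0   # single-sided amplitude
        freqs = np.fft.rfftfreq(len(avg), d=1.0 / fs_hz)
        return spec[np.argmin(np.abs(freqs - f0_hz))]      # magnitude at f0

    rng = np.random.default_rng(0)
    fs, f0, n = 8000, 100.0, 8000                          # 1-s epochs, 100-Hz voice f0
    t = np.arange(n) / fs
    epochs = 170.0 * np.sin(2 * np.pi * f0 * t) + rng.normal(0.0, 400.0, (500, n))
    print(efr_amplitude(epochs, fs, f0))                   # approaches 170 (nV) as noise averages out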
@article {pmid33568701, year = {2021}, author = {Hodges-Simeon, CR and Grail, GPO and Albert, G and Groll, MD and Stepp, CE and Carré, JM and Arnocky, SA}, title = {Testosterone therapy masculinizes speech and gender presentation in transgender men.}, journal = {Scientific reports}, volume = {11}, number = {1}, pages = {3494}, pmid = {33568701}, issn = {2045-2322}, support = {DC013017/NH/NIH HHS/United States ; }, mesh = {Adult ; Humans ; Male ; Speech/*drug effects/physiology ; Speech Acoustics ; Speech Perception/*drug effects/physiology ; Testosterone/*pharmacology ; Transgender Persons/psychology ; Transsexualism/*drug therapy ; Voice/drug effects ; Voice Quality/drug effects ; Young Adult ; }, abstract = {Voice is one of the most noticeably dimorphic traits in humans and plays a central role in gender presentation. Transgender males seeking to align internal identity and external gender expression frequently undergo testosterone (T) therapy to masculinize their voices and other traits. We aimed to determine the importance of changes in vocal masculinity for transgender men and to determine the effectiveness of T therapy at masculinizing three speech parameters: fundamental frequency (i.e., pitch) mean and variation (fo and fo-SD) and estimated vocal tract length (VTL) derived from formant frequencies. Thirty transgender men aged 20 to 40 rated their satisfaction with traits prior to and after T therapy and contributed speech samples and salivary T. Similar-aged cisgender men and women contributed speech samples for comparison. We show that transmen viewed voice change as critical to transition success compared to other masculine traits. However, T therapy may not be sufficient to fully masculinize speech: while fo and fo-SD were largely indistinguishable from cismen, VTL was intermediate between cismen and ciswomen. fo was correlated with salivary T, and VTL was associated with T therapy duration. This argues for additional approaches, such as behavior therapy and/or longer duration of hormone therapy, to improve speech transition.}, }
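VTL estimates of the kind reported above are typically derived from F1-F4 under a uniform closed-open tube model, in which the i-th resonance is F_i = (2i - 1)c / 4L. A minimal sketch follows; averaging the per-formant estimates, and the example formant values, are assumptions rather than the study's exact estimator.

    import numpy as np

    def vtl_cm(formants_hz, c=35000.0):  # c: speed of sound in cm/s
        f = np.asarray(formants_hz, dtype=float)
        i = np.arange(1, len(f) + 1)
        # Invert F_i = (2i - 1) * c / (4 * L) per formant, then average.
        return float(np.mean((2 * i - 1) * c / (4 * f)))

    print(vtl_cm([500.0, 1500.0, 2500.0, 3500.0]))  # 17.5 cm, a canonical adult male value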
@article {pmid33555417, year = {2021}, author = {Heimbauer, LA and Beran, MJ and Owren, MJ}, title = {A chimpanzee recognizes varied acoustical versions of sine-wave and noise-vocoded speech.}, journal = {Animal cognition}, volume = {24}, number = {4}, pages = {843-854}, pmid = {33555417}, issn = {1435-9456}, mesh = {Acoustic Stimulation/veterinary ; Animals ; Cues ; Noise ; Pan troglodytes ; *Speech ; *Speech Perception ; }, abstract = {Previous research demonstrated that a language-trained chimpanzee recognized familiar English words in sine-wave and noise-vocoded forms (Heimbauer et al. Curr Biol 21:1210-1214, 2011). However, those results did not provide information regarding processing strategies of the specific acoustic cues to which the chimpanzee may have attended. The current experiments tested this chimpanzee and adult humans using sine-wave and noise-vocoded speech manipulated using specific sine-waves and a different number of noise bands, respectively. Similar to humans tested with the same stimuli, the chimpanzee was more successful at identifying sine-wave speech when both SW1 and SW2 were present, that is, the components modeled on formants F1 and F2 in the natural speech signal. Results with noise-vocoded speech revealed that the chimpanzee and humans performed best with stimuli that included four or five noise bands, as compared to those with three or two. Overall, amplitude and frequency modulation over time were important for identification of sine-wave and noise-vocoded speech, with further evidence that a nonhuman primate is capable of using top-down processes for speech perception when the signal is altered and incomplete.}, }
@article {pmid33524265, year = {2021}, author = {Yang, J and Xu, L}, title = {Vowel Production in Prelingually Deafened Mandarin-Speaking Children With Cochlear Implants.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {64}, number = {2}, pages = {664-682}, doi = {10.1044/2020_JSLHR-20-00469}, pmid = {33524265}, issn = {1558-9102}, mesh = {Acoustics ; Adult ; Child ; Child, Preschool ; *Cochlear Implantation ; *Cochlear Implants ; *Deafness/surgery ; Humans ; Phonetics ; Speech Acoustics ; *Speech Perception ; }, abstract = {Purpose The purpose of this study was to characterize the acoustic profile and to evaluate the intelligibility of vowel productions in prelingually deafened, Mandarin-speaking children with cochlear implants (CIs). Method Twenty-five children with CIs and 20 age-matched children with normal hearing (NH) were recorded producing a list of Mandarin disyllabic and trisyllabic words containing 20 Mandarin vowels [a, i, u, y, ɤ, ɿ, ʅ, ai, ei, ia, ie, ye, ua, uo, au, ou, iau, iou, uai, uei] located in the first consonant-vowel syllable. The children with CIs were all prelingually deafened and received unilateral implantation before 7 years of age with an average length of CI use of 4.54 years. In the acoustic analysis, the first two formants (F1 and F2) were extracted at seven equidistant time locations for the tested vowels. The durational and spectral features were compared between the CI and NH groups. In the vowel intelligibility task, the extracted vowel portions in both NH and CI children were presented to six Mandarin-speaking, NH adult listeners for identification. Results The acoustic analysis revealed that the children with CIs deviated from the NH controls in the acoustic features for both single vowels and compound vowels. The acoustic deviations were reflected in longer duration, more scattered vowel categories, smaller vowel space area, and distinct formant trajectories in the children with CIs in comparison to NH controls. The vowel intelligibility results showed that the recognition accuracy of the vowels produced by the children with CIs was significantly lower than that of the NH children. The confusion pattern of vowel recognition in the children with CIs generally followed that in the NH children. Conclusion Our data suggested that the prelingually deafened children with CIs, with a relatively long duration of CI experience, still showed measurable acoustic deviations and lower intelligibility in vowel productions in comparison to the NH children.}, }
@article {pmid33522087, year = {2021}, author = {Carl, M and Icht, M}, title = {Acoustic vowel analysis and speech intelligibility in young adult Hebrew speakers: Developmental dysarthria versus typical development.}, journal = {International journal of language & communication disorders}, volume = {56}, number = {2}, pages = {283-298}, doi = {10.1111/1460-6984.12598}, pmid = {33522087}, issn = {1460-6984}, mesh = {Acoustics ; Adolescent ; *Dysarthria/diagnosis ; Humans ; Language ; Phonetics ; Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; Young Adult ; }, abstract = {BACKGROUND: Developmental dysarthria is a motor speech impairment commonly characterized by varying levels of reduced speech intelligibility. The relationship between intelligibility deficits and acoustic vowel space among these individuals has long been noted in the literature, with evidence of vowel centralization (e.g., in English and Mandarin). However, the degree to which this centralization occurs and the intelligibility-acoustic relationship is maintained in different vowel systems has yet to be studied thoroughly. In comparison with American English, the Hebrew vowel system is significantly smaller, with a potentially smaller vowel space area, a factor that may impact upon the comparisons of the acoustic vowel space and its correlation with speech intelligibility. Data on vowel space and speech intelligibility are particularly limited for Hebrew speakers with motor speech disorders.
AIMS: To determine the nature and degree of vowel space centralization in Hebrew-speaking adolescents and young adults with dysarthria, in comparison with typically developing (TD) peers, and to correlate these findings with speech intelligibility scores.
METHODS & PROCEDURES: Adolescents and young adults with developmental dysarthria (secondary to cerebral palsy (CP) and other motor deficits, n = 17) and their TD peers (n = 17) were recorded producing Hebrew corner vowels within single words. For intelligibility assessments, naïve listeners transcribed those words produced by speakers with CP, and intelligibility scores were calculated.
OUTCOMES & RESULTS: Acoustic analysis of vowel formants (F1, F2) revealed a centralization of vowel space among speakers with CP for all acoustic metrics of vowel formants, and mainly for the formant centralization ratio (FCR), in comparison with TD peers. Intelligibility scores were correlated strongly with the FCR metric for speakers with CP.
CONCLUSIONS & IMPLICATIONS: The main results, vowel space centralization for speakers with CP in comparison with TD peers, echo previous cross-linguistic results. The correlation of acoustic results with speech intelligibility carries clinical implications. Taken together, the results contribute to better characterization of the speech production deficit in Hebrew speakers with motor speech disorders. Furthermore, they may guide clinical decision-making and intervention planning to improve speech intelligibility. What this paper adds What is already known on the subject Speech production and intelligibility deficits among individuals with developmental dysarthria (e.g., secondary to CP) are well documented. These deficits have also been correlated with centralization of the acoustic vowel space, although primarily in English speakers. Little is known about the acoustic characteristics of vowels in Hebrew speakers with motor speech disorders, and whether correlations with speech intelligibility are maintained. What this paper adds to existing knowledge This study is the first to describe the acoustic characteristics of vowel space in Hebrew-speaking adolescents and young adults with developmental dysarthria. The results demonstrate a centralization of the acoustic vowel space in comparison with TD peers for all measures, as found in other languages. Correlations between acoustic measures and speech intelligibility scores were also documented. We discuss these results within the context of cross-linguistic comparisons. What are the potential or actual clinical implications of this work? The results confirm the use of objective acoustic measures in the assessment of individuals with motor speech disorders, providing such data for Hebrew-speaking adolescents and young adults. These measures can be used to determine the nature and severity of the speech deficit across languages, may guide intervention planning, as well as measure the effectiveness of intelligibility-based treatment programmes.}, }
@article {pmid33514177, year = {2021}, author = {Bakst, S and Niziolek, CA}, title = {Effects of syllable stress in adaptation to altered auditory feedback in vowels.}, journal = {The Journal of the Acoustical Society of America}, volume = {149}, number = {1}, pages = {708}, pmid = {33514177}, issn = {1520-8524}, support = {R00 DC014520/DC/NIDCD NIH HHS/United States ; }, mesh = {*Feedback ; Humans ; Language ; Phonetics ; *Speech ; Speech Acoustics ; *Speech Perception ; Speech Production Measurement ; }, abstract = {Unstressed syllables in English most commonly contain the vowel quality [ə] (schwa), which is cross-linguistically described as having a variable target. The present study examines whether speakers are sensitive to whether their auditory feedback matches their target when producing unstressed syllables. When speakers hear themselves producing formant-altered speech, they will change their motor plans so that their altered feedback is a better match to the target. If schwa has no target, then feedback mismatches in unstressed syllables may not drive a change in production. In this experiment, participants spoke disyllabic words with initial or final stress where the auditory feedback of F1 was raised (Experiment 1) or lowered (Experiment 2) by 100 mels. Both stressed and unstressed syllables showed adaptive changes in F1. In Experiment 1, initial-stress words showed larger adaptive decreases in F1 than final-stress words, but in Experiment 2, stressed syllables overall showed greater adaptive increases in F1 than unstressed syllables in all words, regardless of which syllable contained the primary stress. These results suggest that speakers are sensitive to feedback mismatches in both stressed and unstressed syllables, but that stress and metrical foot type may mediate the corrective response.}, }
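The perturbation size in this study is specified in mels rather than Hz. Under the common mel mapping m = 2595 log10(1 + f/700), which is an assumption here since mel variants differ across software, a +100-mel shift corresponds to a different Hz offset at each F1:

    import numpy as np

    def hz_to_mel(f):
        return 2595.0 * np.log10(1.0 + f / 700.0)

    def mel_to_hz(m):
        return 700.0 * (10.0 ** (m / 2595.0) - 1.0)

    for f1 in (400.0, 600.0, 800.0):                 # typical F1 values in Hz
        shifted = mel_to_hz(hz_to_mel(f1) + 100.0)   # +100-mel perturbation
        print(f1, round(shifted - f1, 1))            # roughly +102, +121, +140 Hz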
@article {pmid33495033, year = {2023}, author = {Hakanpää, T and Waaramaa, T and Laukkanen, AM}, title = {Training the Vocal Expression of Emotions in Singing: Effects of Including Acoustic Research-Based Elements in the Regular Singing Training of Acting Students.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {2}, pages = {293.e7-293.e23}, doi = {10.1016/j.jvoice.2020.12.032}, pmid = {33495033}, issn = {1873-4588}, mesh = {Humans ; *Singing ; *Voice ; Acoustics ; Students ; Emotions ; }, abstract = {OBJECTIVES: This study examines the effects of including acoustic research-based elements of the vocal expression of emotions in the singing lessons of acting students during a seven-week teaching period. This information may be useful in improving the training of interpretation in singing.
STUDY DESIGN: Experimental comparative study.
METHODS: Six acting students participated in seven weeks of extra training concerning voice quality in the expression of emotions in singing. Song samples were recorded before and after the training. A control group of six acting students was recorded twice within a seven-week period, during which they participated in ordinary training. All participants sang on the vowel [a:] and on a longer phrase expressing anger, sadness, joy, tenderness, and neutral states. The vowel and phrase samples were evaluated by 34 listeners for the perceived emotion. Additionally, the vowel samples were analyzed for formant frequencies (F1-F4), sound pressure level (SPL), spectral structure (Alpha ratio = SPL in the 1500-5000 Hz band minus SPL in the 50-1500 Hz band), harmonic-to-noise ratio (HNR), and perturbation (jitter, shimmer).
RESULTS: The number of correctly perceived expressions improved in the test group's vowel samples, while no significant change was observed in the control group. The overall recognition was higher for the phrases than for the vowel samples. Of the acoustic parameters, F1 and SPL significantly differentiated emotions in both groups, and HNR specifically differentiated emotions in the test group. The Alpha ratio was found to differentiate emotion expression at a statistically significant level after training.
CONCLUSIONS: The expression of emotion in the singing voice improved after seven weeks of voice quality training. The F1, SPL, Alpha ratio, and HNR differentiated emotional expression. The variation in acoustic parameters became wider after training. Similar changes were not observed after seven weeks of ordinary voice training.}, }
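The Alpha ratio defined in the methods is straightforward to compute from a power spectrum. A minimal sketch assuming scipy; the Welch windowing choices are illustrative rather than the study's settings.

    import numpy as np
    from scipy.signal import welch

    def alpha_ratio_db(y, fs):
        freqs, psd = welch(y, fs=fs, nperseg=2048)
        lo = (freqs >= 50) & (freqs < 1500)          # 50-1500 Hz band
        hi = (freqs >= 1500) & (freqs <= 5000)       # 1500-5000 Hz band
        band_hi = np.trapz(psd[hi], freqs[hi])
        band_lo = np.trapz(psd[lo], freqs[lo])
        # More negative values indicate less relative high-frequency energy.
        return 10.0 * np.log10(band_hi / band_lo)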
@article {pmid33484095, year = {2021}, author = {Mendoza Ramos, V and Paulyn, C and Van den Steen, L and Hernandez-Diaz Huici, ME and De Bodt, M and Van Nuffelen, G}, title = {Effect of boost articulation therapy (BArT) on intelligibility in adults with dysarthria.}, journal = {International journal of language & communication disorders}, volume = {56}, number = {2}, pages = {271-282}, pmid = {33484095}, issn = {1460-6984}, mesh = {Adult ; Behavior Therapy ; *Dysarthria/diagnosis/therapy ; Humans ; Speech Articulation Tests ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {BACKGROUND: The articulatory accuracy of patients with dysarthria is one of the most affected speech dimensions with a high impact on speech intelligibility. Behavioural treatments of articulation can involve either direct or indirect approaches. The latter have been thoroughly investigated and are generally appreciated for their almost immediate effects on articulation and intelligibility. The number of studies on (short-term) direct articulation therapy is limited.
AIMS: To investigate the effects of short-term, boost articulation therapy (BArT) on speech intelligibility in patients with chronic or progressive dysarthria and the effect of severity of dysarthria on the outcome.
METHODS & PROCEDURES: The study consists of a two-group pre-/post-test design to assess speech intelligibility at phoneme and sentence level and during spontaneous speech, automatic speech and reading a phonetically balanced text. A total of 17 subjects with mild to severe dysarthria participated in the study and were randomly assigned to either a patient-tailored, intensive articulatory drill programme or an intensive minimal pair training. Both training programmes were based on the principles of motor learning. Each training programme consisted of five sessions of 45 min completed within one week.
OUTCOMES & RESULTS: Following treatment, a statistically significant increase of mean group intelligibility was shown at phoneme and sentence level, and in automatic sequences. This was supported by an acoustic analysis that revealed a reduction in formant centralization ratio. Within specific groups of severity, large and moderate positive effect sizes with Cohen's d were demonstrated.
CONCLUSIONS & IMPLICATIONS: BArT successfully improves speech intelligibility in patients with chronic or progressive dysarthria at different levels of the impairment. What this paper adds What is already known on the subject Behavioural treatment of articulation in patients with dysarthria mainly involves indirect strategies, which have shown positive effects on speech intelligibility. However, there is limited evidence on the short-term effects of direct articulation therapy at the segmental level of speech. This study investigates the effectiveness of BArT on speech intelligibility in patients with chronic or progressive dysarthria at all severity levels. What this paper adds to existing knowledge The intensive and direct articulatory therapy programmes developed and applied in this study intend to reduce the impairment instead of compensating for it. This approach results in a significant improvement of speech intelligibility at different dysarthria severity levels in a short period of time while helping to exploit and develop all available residual motor skills in persons with dysarthria. What are the potential or actual clinical implications of this work? The improvements in intelligibility demonstrate the effectiveness of BArT at the segmental level of speech. This makes it a suitable approach to consider in the treatment of patients with chronic or progressive dysarthria.}, }
@article {pmid33455538, year = {2022}, author = {Kulikov, V}, title = {Voice and Emphasis in Arabic Coronal Stops: Evidence for Phonological Compensation.}, journal = {Language and speech}, volume = {65}, number = {1}, pages = {73-104}, pmid = {33455538}, issn = {1756-6053}, mesh = {Cues ; Humans ; Language ; Phonetics ; Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {The current study investigates multiple acoustic cues associated with the phonological contrasts of voicing and emphasis in the production of Arabic coronal stops: voice onset time (VOT), spectral center of gravity (SCG) of the burst, pitch (F0), and the frequencies of the first (F1) and second (F2) formants at vowel onset. The analysis of the acoustic data collected from eight native speakers of the Qatari dialect showed that the three stops form three distinct modes on the VOT scale: [d] is (pre)voiced, voiceless [t] is aspirated, and emphatic [ṭ] is voiceless unaspirated. The contrast is also maintained in spectral cues. Each cue influences the production of coronal stops, while their relevance to the phonological contrasts varies. VOT was most relevant for voicing, but F2 was mostly associated with emphasis. The perception experiment revealed that listeners were able to categorize ambiguous tokens correctly and compensate for phonological contrasts. The listeners' results were used to evaluate three categorization models to predict the intended category of a coronal stop: a model with unweighted and unadjusted cues, a model with weighted cues compensating for phonetic context, and a model with weighted cues compensating for the voicing and emphasis contrasts. The findings suggest that the model with phonological compensation performed most similarly to human listeners in terms of both accuracy rate and error pattern.}, }
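The weighted-cue categorization idea can be illustrated generically with a multinomial classifier over the five cues. The sketch below (scikit-learn assumed) is not any of the paper's three models, and the cue values are invented rather than the Qatari Arabic measurements.

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    # Columns: VOT (ms), burst SCG (Hz), F0 (Hz), F1 (Hz), F2 (Hz).
    X = np.array([[-85.0, 3200.0, 120.0, 450.0, 1700.0],   # voiced [d]
                  [ 55.0, 4500.0, 135.0, 500.0, 1800.0],   # aspirated [t]
                  [ 12.0, 3000.0, 125.0, 550.0, 1200.0]])  # emphatic, voiceless unaspirated
    y = np.array(["d", "t", "emphatic-t"])

    clf = LogisticRegression(max_iter=1000).fit(X, y)
    # An ambiguous token: short-lag VOT combined with a low F2 onset.
    print(clf.predict([[10.0, 3100.0, 124.0, 560.0, 1250.0]]))

The fitted coefficients play the role of cue weights: a cue that separates the categories well (here VOT and F2) dominates the decision, which is the intuition behind comparing weighted and unweighted cue models.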
@article {pmid33441596, year = {2021}, author = {Aung, T and Goetz, S and Adams, J and McKenna, C and Hess, C and Roytman, S and Cheng, JT and Zilioli, S and Puts, D}, title = {Low fundamental and formant frequencies predict fighting ability among male mixed martial arts fighters.}, journal = {Scientific reports}, volume = {11}, number = {1}, pages = {905}, pmid = {33441596}, issn = {2045-2322}, mesh = {Acoustics ; Adult ; Aggression/*physiology/psychology ; Anthropometry ; Athletes/psychology ; Biomarkers ; Cues ; Humans ; Male ; Martial Arts/physiology ; Phenotype ; Pitch Discrimination/physiology ; Sexual Behavior/physiology/psychology ; Social Perception/psychology ; Voice/*physiology ; }, abstract = {Human voice pitch is highly sexually dimorphic and eminently quantifiable, making it an ideal phenotype for studying the influence of sexual selection. In both traditional and industrial populations, lower pitch in men predicts mating success, reproductive success, and social status and shapes social perceptions, especially those related to physical formidability. Due to practical and ethical constraints however, scant evidence tests the central question of whether male voice pitch and other acoustic measures indicate actual fighting ability in humans. To address this, we examined pitch, pitch variability, and formant position of 475 mixed martial arts (MMA) fighters from an elite fighting league, with each fighter's acoustic measures assessed from multiple voice recordings extracted from audio or video interviews available online (YouTube, Google Video, podcasts), totaling 1312 voice recording samples. In four regression models each predicting a separate measure of fighting ability (win percentages, number of fights, Elo ratings, and retirement status), no acoustic measure significantly predicted fighting ability above and beyond covariates. However, after fight statistics, fight history, height, weight, and age were used to extract underlying dimensions of fighting ability via factor analysis, pitch and formant position negatively predicted "Fighting Experience" and "Size" factor scores in a multivariate regression model, explaining 3-8% of the variance. Our findings suggest that lower male pitch and formants may be valid cues of some components of fighting ability in men.}, }
@article {pmid33413460, year = {2021}, author = {Volodin, IA and Volodina, EV and Frey, R}, title = {Rutting vocal display in male impala (Aepyceros melampus) and overlap with alarm context.}, journal = {Frontiers in zoology}, volume = {18}, number = {1}, pages = {2}, pmid = {33413460}, issn = {1742-9994}, abstract = {BACKGROUND: The rutting vocal display of male impala Aepyceros melampus is unique for its complexity among ruminants. This study investigates bouts of rutting calls produced towards potential mates and rival males by free-ranging male impala in Namibia. In particular, a comparison of male rutting and alarm snorts is conducted, inspired by earlier findings that male topi Damaliscus lunatus use alarm snorts in mate guarding.
RESULTS: Rutting male impala produced 4-38 (13.5 ± 6.5) rutting calls per bout. We analyzed 201 bouts, containing in total 2709 rutting calls of five types: continuous roars produced within a single exhalation-inhalation cycle; interrupted roars comprising a few exhalation-inhalation cycles; pant-roars distinguished by a pant-phase with rapidly alternating inhalations and exhalations; usual snorts lacking any roar part; and roar-snorts starting with a short roar part. Bouts mostly started and ended with usual snorts. Continuous roars were the shortest roars. The average duration of the exhalatory phase was longest in the continuous roars and shortest in the pant-roars. The average fundamental frequency (49.7-51.4 Hz) did not differ between roar types. Vocal tract length, calculated by using measurements of the first four vocal tract resonances (formants), ranged within 381-382 mm in all roar types. In the studied male impala, rutting snorts within bouts of rutting calls were longer and had higher values of the upper quartile in the call spectra than alarm snorts produced towards potential danger.
CONCLUSIONS: Additional inhalations during the emission of the interrupted and pant-roars prolong their duration compared to the continuous roars but do not affect the fundamental frequency or the degree of larynx retraction while roaring. Alarm snorts are separated from one another by large intervals, whereas the intervals between rutting snorts within bouts are short. Sometimes, rutting snorts alternate with roars, whereas alarm snorts do not. Therefore, it is not the acoustic structure of individual snorts but the temporal sequence and the occasional association with another call type that defines snorts as either rutting or alarm snorts. The rutting snorts of male impala may function to attract the attention of receptive females and delay their departure from a male's harem or territory.}, }
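The 381-382 mm vocal tract length figures are the kind of estimate obtained by fitting measured formants to a uniform closed-open tube, F_i = (2i - 1)c / 4L. The zero-intercept regression below is a common way to do this; whether it matches the authors' exact procedure is an assumption, and the formant values are illustrative.

    import numpy as np

    def vtl_mm(formants_hz, c=350000.0):             # c: speed of sound in mm/s
        f = np.asarray(formants_hz, dtype=float)
        odd = 2.0 * np.arange(1, len(f) + 1) - 1.0   # 1, 3, 5, 7 for F1-F4
        slope = np.sum(odd * f) / np.sum(odd ** 2)   # zero-intercept least squares
        return c / (4.0 * slope)                     # since slope = c / (4 L)

    print(vtl_mm([230.0, 690.0, 1150.0, 1610.0]))    # about 380 mm, near the values reported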
@article {pmid33399816, year = {2021}, author = {Bodaghi, D and Jiang, W and Xue, Q and Zheng, X}, title = {Effect of Supraglottal Acoustics on Fluid-Structure Interaction During Human Voice Production.}, journal = {Journal of biomechanical engineering}, volume = {143}, number = {4}, pages = {}, pmid = {33399816}, issn = {1528-8951}, support = {R01 DC009616/DC/NIDCD NIH HHS/United States ; }, mesh = {*Phonation ; }, abstract = {A hydrodynamic/acoustic splitting method was used to examine the effect of supraglottal acoustics on fluid-structure interactions during human voice production in a two-dimensional computational model. The accuracy of the method in simulating compressible flows in typical human airway conditions was verified by comparing it to full compressible flow simulations. The method was coupled with a three-mass model of vocal fold lateral motion to simulate fluid-structure interactions during human voice production. By separating the acoustic perturbation components of the airflow, the method allows isolation of the role of supraglottal acoustics in fluid-structure interactions. The results showed that an acoustic resonance between a higher harmonic of the sound source and the first formant of the supraglottal tract occurred during normal human phonation when the fundamental frequency was much lower than the formants. The resonance resulted in acoustic pressure perturbation at the glottis which was of the same order as the incompressible flow pressure and found to affect vocal fold vibrations and glottal flow rate waveform. Specifically, the acoustic perturbation delayed the opening of the glottis, reduced the vertical phase difference of vocal fold vibrations, decreased flow rate and maximum flow deceleration rate (MFDR) at the glottal exit; yet, they had little effect on glottal opening. The results imply that the sound generation in the glottis and acoustic resonance in the supraglottal tract are coupled processes during human voice production and computer modeling of vocal fold vibrations needs to include supraglottal acoustics for accurate predictions.}, }
@article {pmid33397591, year = {2023}, author = {Feng, M and Howard, DM}, title = {The Dynamic Effect of the Valleculae on Singing Voice - An Exploratory Study Using 3D Printed Vocal Tracts.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {2}, pages = {178-186}, doi = {10.1016/j.jvoice.2020.12.012}, pmid = {33397591}, issn = {1873-4588}, mesh = {Humans ; *Singing ; Speech Acoustics ; *Voice/physiology ; Acoustics ; Printing, Three-Dimensional ; }, abstract = {BACKGROUND AND OBJECTIVES: The valleculae can be seen as a pair of side branches of the human vocal tract like the piriform fossae. While the acoustic properties of the piriform fossae have been explored in detail, there is little evidence of full exploration of the acoustic properties of the valleculae. A recent investigation (Vampola, Horáček, & Švec, 2015), using a finite element model of a single vowel /a/, suggests that the valleculae created two antiresonances and two resonances in the high frequency region (above 4 kHz) along with those produced by the piriform sinuses. In the current study, we investigate, in multiple vowels, the acoustic influences of the valleculae in the singing voice, using 3-D printed vocal tracts.
METHOD: MRI data were collected from an operatic tenor singing English vowels /a/, /u/, /i/. The images of each vowel were segmented and edited to create a pair of tracts, one the original and one with the valleculae digitally removed. The printed tracts were then placed atop a vocal tract organ loudspeaker and excited by white noise. Recordings were made with a microphone placed in front of the mouths of the tracts to measure their frequency responses.
RESULTS: Dimensional changes were observed in the valleculae across different vowels, with the long-term average spectra of the recordings illustrating clear differences between the frequency responses of the va-nova (valleculae - no valleculae) pairs; these differences vary with the vowel.
CONCLUSION: The experiment demonstrates the dynamic nature of the shapes of the valleculae in the human vocal tract and its acoustic consequences. It provides evidence that the valleculae have similar acoustic properties to the piriform fossae but with larger variations, and in some cases can acoustically influence the frequency region below 4 kHz. The results suggest that large-volume valleculae have the potential to impede to some extent the acoustic effect of the singer's formant cluster and small valleculae may do the reverse. Since the volume of the valleculae is observed to depend largely on tongue movement and to change with the uttered vowel, it can be assumed that the high frequency energy, including that within the singer's formant region, could be vowel dependent. Strategies to control valleculae volumes are likely to be highly relevant to voice pedagogy practice as well as singing performance.}, }
@article {pmid36154080, year = {2021}, author = {Ying Liu, Y and Polka, L and Masapollo, M and Ménard, L}, title = {Disentangling the roles of formant proximity and stimulus prototypicality in adult vowel perception.}, journal = {JASA express letters}, volume = {1}, number = {1}, pages = {015201}, doi = {10.1121/10.0003041}, pmid = {36154080}, issn = {2691-1191}, abstract = {The present investigation examined the extent to which asymmetries in vowel perception derive from a sensitivity to focalization (formant proximity), stimulus prototypicality, or both. English-speaking adults identified, rated, and discriminated a vowel series that spanned a less-focal/prototypic English /u/ and a more-focal/prototypic French /u/ exemplar. Discrimination pairs included one-step, two-step, and three-step intervals along the series. Asymmetries predicted by both focalization and prototype effects emerged when discrimination step-size was varied. The findings indicate that both generic/universal and language-specific biases shape vowel perception in adults; the latter are challenging to isolate without well-controlled stimuli and appropriately scaled discrimination tasks.}, }
@article {pmid33379914, year = {2020}, author = {Lovcevic, I and Kalashnikova, M and Burnham, D}, title = {Acoustic features of infant-directed speech to infants with hearing loss.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {6}, pages = {3399}, doi = {10.1121/10.0002641}, pmid = {33379914}, issn = {1520-8524}, mesh = {Acoustics ; Adult ; *Deafness ; Female ; *Hearing Loss/diagnosis ; Humans ; Infant ; Speech ; *Speech Perception ; }, abstract = {This study investigated the effects of hearing loss and hearing experience on the acoustic features of infant-directed speech (IDS) to infants with hearing loss (HL) compared to controls with normal hearing (NH) matched by either chronological or hearing age (experiment 1) and across development in infants with hearing loss as well as the relation between IDS features and infants' developing lexical abilities (experiment 2). Both experiments included detailed acoustic analyses of mothers' productions of the three corner vowels /a, i, u/ and utterance-level pitch in IDS and in adult-directed speech. Experiment 1 demonstrated that IDS to infants with HL was acoustically more variable than IDS to hearing-age matched infants with NH. Experiment 2 yielded no changes in IDS features over development; however, the results did show a positive relationship between formant distances in mothers' speech and infants' concurrent receptive vocabulary size, as well as between vowel hyperarticulation and infants' expressive vocabulary. These findings suggest that despite infants' HL and thus diminished access to speech input, infants with HL are exposed to IDS with generally similar acoustic qualities as are infants with NH. However, some differences persist, indicating that infants with HL might receive less intelligible speech.}, }
@article {pmid33379900, year = {2020}, author = {Nault, DR and Munhall, KG}, title = {Individual variability in auditory feedback processing: Responses to real-time formant perturbations and their relation to perceptual acuity.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {6}, pages = {3709}, doi = {10.1121/10.0002923}, pmid = {33379900}, issn = {1520-8524}, abstract = {In this study, both between-subject and within-subject variability in speech perception and speech production were examined in the same set of speakers. Perceptual acuity was determined using an ABX auditory discrimination task, whereby speakers made judgments between pairs of syllables on a /ɛ/ to /æ/ acoustic continuum. Auditory feedback perturbations of the first two formants were implemented in a production task to obtain measures of compensation, normal speech production variability, and vowel spacing. Speakers repeated the word "head" 120 times under varying feedback conditions, with the final Hold phase involving the strongest perturbations of +240 Hz in F1 and -300 Hz in F2. Multiple regression analyses were conducted to determine whether individual differences in compensatory behavior in the Hold phase could be predicted by perceptual acuity, speech production variability, and vowel spacing. Perceptual acuity significantly predicted formant changes in F1, but not in F2. These results are discussed in consideration of the importance of using larger sample sizes in the field and developing new methods to explore feedback processing at the individual participant level. The potential positive role of variability in speech motor control is also considered.}, }
@article {pmid33379892, year = {2020}, author = {Kothare, H and Raharjo, I and Ramanarayanan, V and Ranasinghe, K and Parrell, B and Johnson, K and Houde, JF and Nagarajan, SS}, title = {Sensorimotor adaptation of speech depends on the direction of auditory feedback alteration.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {6}, pages = {3682}, pmid = {33379892}, issn = {1520-8524}, support = {K08 AG058749/AG/NIA NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; R01 DC013979/DC/NIDCD NIH HHS/United States ; R01 DC010145/DC/NIDCD NIH HHS/United States ; }, mesh = {Feedback ; Feedback, Sensory ; Humans ; *Speech ; Speech Acoustics ; *Speech Perception ; }, abstract = {A hallmark feature of speech motor control is its ability to learn to anticipate and compensate for persistent feedback alterations, a process referred to as sensorimotor adaptation. Because this process involves adjusting articulation to counter the perceived effects of altering acoustic feedback, there are a number of factors that affect it, including the complex relationship between acoustics and articulation and non-uniformities of speech perception. As a consequence, sensorimotor adaptation is hypothesised to vary as a function of the direction of the applied auditory feedback alteration in vowel formant space. This hypothesis was tested in two experiments where auditory feedback was altered in real time, shifting the frequency values of the first and second formants (F1 and F2) of participants' speech. Shifts were designed on a subject-by-subject basis and sensorimotor adaptation was quantified with respect to the direction of applied shift, normalised for individual speakers. Adaptation was indeed found to depend on the direction of the applied shift in vowel formant space, independent of shift magnitude. These findings have implications for models of sensorimotor adaptation of speech.}, }
@article {pmid33379880, year = {2020}, author = {Houle, N and Levi, SV}, title = {Acoustic differences between voiced and whispered speech in gender diverse speakers.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {6}, pages = {4002}, doi = {10.1121/10.0002952}, pmid = {33379880}, issn = {1520-8524}, mesh = {Acoustics ; Phonetics ; *Speech ; Speech Acoustics ; Speech Production Measurement ; *Voice ; }, abstract = {Whispered speech is a naturally produced mode of communication that lacks a fundamental frequency. Several other acoustic differences exist between whispered and voiced speech, such as speaking rate (measured as segment duration) and formant frequencies. Previous research has shown that listeners are less accurate at identifying linguistic information (e.g., identifying a speech sound) and speaker information (e.g., reporting speaker gender) from whispered speech. To further explore differences between voiced and whispered speech, acoustic differences were examined across three datasets (hVd, sVd, and ʃVd) and three speaker groups (ciswomen, transwomen, cismen). Consistent with previous studies, vowel duration was generally longer in whispered speech and formant frequencies were shifted higher, although the magnitude of these differences depended on vowel and gender. Despite the increase in duration, the acoustic vowel space area (measured either with a vowel quadrilateral or with a convex hull) was smaller in the whispered speech, suggesting that larger vowel space areas are not an automatic consequence of a lengthened articulation. Overall, these findings are consistent with previous literature showing acoustic differences between voiced and whispered speech beyond the articulatory change of eliminating fundamental frequency.}, }
@article {pmid33369591, year = {2021}, author = {Ananthakrishnan, S and Grinstead, L and Yurjevich, D}, title = {Human Frequency Following Responses to Filtered Speech.}, journal = {Ear and hearing}, volume = {42}, number = {1}, pages = {87-105}, doi = {10.1097/AUD.0000000000000902}, pmid = {33369591}, issn = {1538-4667}, mesh = {Acoustic Stimulation ; Adult ; *Hearing Aids ; Humans ; Noise ; Speech ; *Speech Perception ; }, abstract = {OBJECTIVES: There is increasing interest in using the frequency following response (FFR) to describe the effects of varying different aspects of hearing aid signal processing on brainstem neural representation of speech. To this end, recent studies have examined the effects of filtering on brainstem neural representation of the speech fundamental frequency (f0) in listeners with normal hearing sensitivity by measuring FFRs to low- and high-pass filtered signals. However, the stimuli used in these studies do not reflect the entire range of typical cutoff frequencies used in frequency-specific gain adjustments during hearing aid fitting. Further, there has been limited discussion on the effect of filtering on brainstem neural representation of formant-related harmonics. Here, the effects of filtering on brainstem neural representation of speech fundamental frequency (f0) and harmonics related to first formant frequency (F1) were assessed by recording envelope and spectral FFRs to a vowel low-, high-, and band-pass filtered at cutoff frequencies ranging from 0.125 to 8 kHz.
DESIGN: FFRs were measured to a synthetically generated vowel stimulus /u/ presented in a full bandwidth and low-pass (experiment 1), high-pass (experiment 2), and band-pass (experiment 3) filtered conditions. In experiment 1, FFRs were measured to a synthetically generated vowel stimulus /u/ presented in a full bandwidth condition as well as 11 low-pass filtered conditions (low-pass cutoff frequencies: 0.125, 0.25, 0.5, 0.75, 1, 1.5, 2, 3, 4, 6, and 8 kHz) in 19 adult listeners with normal hearing sensitivity. In experiment 2, FFRs were measured to the same synthetically generated vowel stimulus /u/ presented in a full bandwidth condition as well as 10 high-pass filtered conditions (high-pass cutoff frequencies: 0.125, 0.25, 0.5, 0.75, 1, 1.5, 2, 3, 4, and 6 kHz) in 7 adult listeners with normal hearing sensitivity. In experiment 3, in addition to the full bandwidth condition, FFRs were measured to vowel /u/ low-pass filtered at 2 kHz, band-pass filtered between 2-4 kHz and 4-6 kHz in 10 adult listeners with normal hearing sensitivity. A Fast Fourier Transform analysis was conducted to measure the strength of f0 and the F1-related harmonic relative to the noise floor in the brainstem neural responses obtained to the full bandwidth and filtered stimulus conditions.
RESULTS: Brainstem neural representation of f0 was reduced when the low-pass filter cutoff frequency was between 0.25 and 0.5 kHz; no differences in f0 strength were noted between conditions when the low-pass filter cutoff condition was at or greater than 0.75 kHz. While envelope FFR f0 strength was reduced when the stimulus was high-pass filtered at 6 kHz, there was no effect of high-pass filtering on brainstem neural representation of f0 when the high-pass filter cutoff frequency ranged from 0.125 to 4 kHz. There was a weakly significant global effect of band-pass filtering on brainstem neural phase-locking to f0. A trends analysis indicated that mean f0 magnitude in the brainstem neural response was greater when the stimulus was band-pass filtered between 2 and 4 kHz as compared to when the stimulus was band-pass filtered between 4 and 6 kHz, low-pass filtered at 2 kHz or presented in the full bandwidth condition. Last, neural phase-locking to f0 was reduced or absent in envelope FFRs measured to filtered stimuli that lacked spectral energy above 0.125 kHz or below 6 kHz. Similarly, little to no energy was seen at F1 in spectral FFRs obtained to low-, high-, or band-pass filtered stimuli that did not contain energy in the F1 region. For stimulus conditions that contained energy at F1, the strength of the peak at F1 in the spectral FFR varied little with low-, high-, or band-pass filtering.
CONCLUSIONS: Energy at f0 in envelope FFRs may arise due to neural phase-locking to low-, mid-, or high-frequency stimulus components, provided the stimulus envelope is modulated by at least two interacting harmonics. Stronger neural responses at f0 are measured when filtering results in stimulus bandwidths that preserve stimulus energy at F1 and F2. In addition, results suggest that unresolved harmonics may favorably influence f0 strength in the neural response. Lastly, brainstem neural representation of the F1-related harmonic measured in spectral FFRs obtained to filtered stimuli is related to the presence or absence of stimulus energy at F1. These findings add to the existing literature exploring the viability of the FFR as an objective technique to evaluate hearing aid fitting where stimulus bandwidth is altered by design due to frequency-specific gain applied by amplification algorithms.}, }
@article {pmid33356887, year = {2021}, author = {Parrell, B and Niziolek, CA}, title = {Increased speech contrast induced by sensorimotor adaptation to a nonuniform auditory perturbation.}, journal = {Journal of neurophysiology}, volume = {125}, number = {2}, pages = {638-647}, pmid = {33356887}, issn = {1522-1598}, support = {R00 DC014520/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; R01 DC017696/DC/NIDCD NIH HHS/United States ; }, mesh = {*Adaptation, Physiological ; *Feedback, Sensory ; Female ; Humans ; Male ; Phonetics ; Sensorimotor Cortex/*physiology ; *Speech ; *Speech Perception ; Young Adult ; }, abstract = {When auditory feedback is perturbed in a consistent way, speakers learn to adjust their speech to compensate, a process known as sensorimotor adaptation. Although this paradigm has been highly informative for our understanding of the role of sensory feedback in speech motor control, its ability to induce behaviorally relevant changes in speech that affect communication effectiveness remains unclear. Because reduced vowel contrast contributes to intelligibility deficits in many neurogenic speech disorders, we examine human speakers' ability to adapt to a nonuniform perturbation field that was designed to affect vowel distinctiveness, applying a shift that depended on the vowel being produced. Twenty-five participants were exposed to this "vowel centralization" feedback perturbation in which the first two formant frequencies were shifted toward the center of each participant's vowel space, making vowels less distinct from one another. Speakers adapted to this nonuniform shift, learning to produce corner vowels with increased vowel space area and vowel contrast to partially overcome the perceived centralization. The increase in vowel contrast occurred without a concomitant increase in duration and persisted after the feedback shift was removed, including after a 10-min silent period. These findings establish the validity of a sensorimotor adaptation paradigm to increase vowel contrast, showing that complex, nonuniform alterations to sensory feedback can successfully drive changes relevant to intelligible communication.NEW & NOTEWORTHY To date, the speech motor learning evoked in sensorimotor adaptation studies has had little ecological consequences for communication. By inducing complex, nonuniform acoustic errors, we show that adaptation can be leveraged to cause an increase in speech sound contrast, a change that has the capacity to improve intelligibility. This study is relevant for models of sensorimotor integration across motor domains, showing that complex alterations to sensory feedback can successfully drive changes relevant to ecological behavior.}, }
@article {pmid33302780, year = {2021}, author = {Pisanski, K and Sorokowski, P}, title = {Human Stress Detection: Cortisol Levels in Stressed Speakers Predict Voice-Based Judgments of Stress.}, journal = {Perception}, volume = {50}, number = {1}, pages = {80-87}, doi = {10.1177/0301006620978378}, pmid = {33302780}, issn = {1468-4233}, mesh = {Cues ; Humans ; Hydrocortisone ; Judgment ; *Speech Perception ; *Voice ; }, abstract = {Despite recent evidence of a positive relationship between cortisol levels and voice pitch in stressed speakers, the extent to which human listeners can reliably judge stress from the voice remains unknown. Here, we tested whether voice-based judgments of stress co-vary with the free cortisol levels and vocal parameters of speakers recorded in a real-life stressful situation (oral examination) and baseline (2 weeks prior). Hormone and acoustic analyses indicated elevated salivary cortisol levels and corresponding changes in voice pitch, vocal tract resonances (formants), and speed of speech during stress. In turn, listeners' stress ratings correlated significantly with speakers' cortisol levels. Higher pitched voices were consistently perceived as more stressed; however, the influence of formant frequencies, vocal perturbation and noise parameters on stress ratings varied across contexts, suggesting that listeners utilize different strategies when assessing calm versus stressed speech. These results indicate that nonverbal vocal cues can convey honest information about a speaker's underlying physiological level of stress that listeners can, to some extent, detect and utilize, while underscoring the necessity to control for individual differences in the biological stress response.}, }
@article {pmid33296889, year = {2020}, author = {Yu, M and Wen, Y and Xu, L and Han, F and Gao, X}, title = {Polysomnographic characteristics and acoustic analysis of catathrenia (nocturnal groaning).}, journal = {Physiological measurement}, volume = {41}, number = {12}, pages = {125012}, doi = {10.1088/1361-6579/abd235}, pmid = {33296889}, issn = {1361-6579}, mesh = {*Acoustics ; Adult ; Female ; Humans ; Male ; *Parasomnias/diagnosis ; *Polysomnography ; Sleep Stages ; Sleep, REM ; Snoring/*diagnosis ; Young Adult ; }, abstract = {OBJECTIVE: Catathrenia is a sleep disorder characterized by nocturnal groaning sounds emitted during prolonged expiration. Because catathrenia is a rare condition, its reported polysomnographic findings have been inconsistent. We aimed to present polysomnographic characteristics of catathrenia patients and perform acoustic analysis of groaning sounds.
APPROACH: Twenty-three patients (8 males and 15 females) diagnosed with catathrenia by video-polysomnography were included. They underwent clinical evaluation and physical examination, and answered a questionnaire. Acoustic analyses (oscillograms and spectrograms) of catathrenia and snoring signals were performed in Praat 6.1.09. Sounds were classified according to the Yanagihara criteria.
MAIN RESULTS: The average age of catathrenia patients was 29.6 ± 10.0 years, with a body mass index of 22.3 ± 5.1 kg/m². A total of 3728 groaning episodes were documented. Catathrenia events of 16 patients (70%) were rapid eye movement (REM)-predominant. The average duration of groaning was 11.4 ± 4.6 s, ranging from 1.3 to 74.9 s. All signals of groaning were rhythmic or semi-rhythmic, classified as type I and type II, respectively, with formants and harmonics. Snoring events were observed in nine patients. Snoring mainly occurred in the non-REM stage, with a duration of less than 1.5 s. Signals of snoring were chaotic, classified as type III, without harmonics.
SIGNIFICANCE: Catathrenia occurred in all sleep stages but mainly in REM. Durations of groaning varied greatly across patients. Acoustic characteristics of catathrenia were typical. Groaning had rhythmic or semi-rhythmic waveform, formants and harmonics, indicating vocal origin, while snoring had chaotic waveform.}, }
@article {pmid33293174, year = {2023}, author = {Albuquerque, L and Oliveira, C and Teixeira, A and Sa-Couto, P and Figueiredo, D}, title = {A Comprehensive Analysis of Age and Gender Effects in European Portuguese Oral Vowels.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {1}, pages = {143.e13-143.e29}, doi = {10.1016/j.jvoice.2020.10.021}, pmid = {33293174}, issn = {1873-4588}, mesh = {Adult ; Humans ; Male ; Female ; Middle Aged ; Aged ; Aged, 80 and over ; Portugal ; *Phonetics ; *Speech ; Speech Acoustics ; Language ; }, abstract = {The knowledge about age effects in speech acoustics is still dispersed and incomplete. This study extends the analyses of the effects of age and gender on the acoustics of European Portuguese (EP) oral vowels, in order to complement initial studies with limited sets of acoustic parameters, and to further investigate unclear or inconsistent results. A database of EP vowels produced by a group of 113 adults, aged between 35 and 97, was used. Duration, fundamental frequency (f0), formant frequencies (F1 to F3), and a selection of vowel space metrics (F1 and F2 range ratios, vowel articulation index [VAI] and formant centralization ratio [FCR]) were analyzed. To avoid the arguable division into age groups, the analyses considered age as a continuous variable. The most relevant age-related results included: vowel duration increase in both genders; a general tendency for formant frequencies to decrease for females; changes that were consistent with vowel centralization for males, confirmed by the vowel space acoustic indexes; and no evidence of F3 decrease with age, in both genders. This study has contributed to knowledge on aging speech, providing new information for an additional language. The results corroborated that acoustic characteristics of speech change with age and present different patterns between genders.}, }
@article {pmid33286105, year = {2020}, author = {Van Soom, M and de Boer, B}, title = {Detrending the Waveforms of Steady-State Vowels.}, journal = {Entropy (Basel, Switzerland)}, volume = {22}, number = {3}, pages = {}, pmid = {33286105}, issn = {1099-4300}, abstract = {Steady-state vowels are vowels that are uttered with a momentarily fixed vocal tract configuration and with steady vibration of the vocal folds. In this steady-state, the vowel waveform appears as a quasi-periodic string of elementary units called pitch periods. Humans perceive this quasi-periodic regularity as a definite pitch. Likewise, so-called pitch-synchronous methods exploit this regularity by using the duration of the pitch periods as a natural time scale for their analysis. In this work, we present a simple pitch-synchronous method using a Bayesian approach for estimating formants that slightly generalizes the basic approach of modeling the pitch periods as a superposition of decaying sinusoids, one for each vowel formant, by explicitly taking into account the additional low-frequency content in the waveform which arises not from formants but rather from the glottal pulse. We model this low-frequency content in the time domain as a polynomial trend function that is added to the decaying sinusoids. The problem then reduces to a rather familiar one in macroeconomics: estimate the cycles (our decaying sinusoids) independently from the trend (our polynomial trend function); in other words, detrend the waveforms of steady-state vowels. We show how to do this efficiently.}, }
@article {pmid33277544, year = {2020}, author = {Schild, C and Aung, T and Kordsmeyer, TL and Cardenas, RA and Puts, DA and Penke, L}, title = {Linking human male vocal parameters to perceptions, body morphology, strength and hormonal profiles in contexts of sexual selection.}, journal = {Scientific reports}, volume = {10}, number = {1}, pages = {21296}, pmid = {33277544}, issn = {2045-2322}, mesh = {Adolescent ; Adult ; Female ; Humans ; *Hydrocortisone ; Male ; Muscle Strength ; Pitch Perception ; *Sexual Selection ; Social Dominance ; *Speech Acoustics ; *Testosterone ; *Voice ; Young Adult ; }, abstract = {Sexual selection appears to have shaped the acoustic signals of diverse species, including humans. Deep, resonant vocalizations in particular may function in attracting mates and/or intimidating same-sex competitors. Evidence for these adaptive functions in human males derives predominantly from perception studies in which vocal acoustic parameters were manipulated using specialist software. This approach affords tight experimental control but provides little ecological validity, especially when the target acoustic parameters vary naturally with other parameters. Furthermore, such experimental studies provide no information about what acoustic variables indicate about the speaker-that is, why attention to vocal cues may be favored in intrasexual and intersexual contexts. Using voice recordings with high ecological validity from 160 male speakers and biomarkers of condition, including baseline cortisol and testosterone levels, body morphology and strength, we tested a series of pre-registered hypotheses relating to both perceptions and underlying condition of the speaker. We found negative curvilinear and negative linear relationships between male fundamental frequency (fo) and female perceptions of attractiveness and male perceptions of dominance. In addition, cortisol and testosterone negatively interacted in predicting fo, and strength and measures of body size negatively predicted formant frequencies (Pf). Meta-analyses of the present results and those from two previous samples confirmed that fo negatively predicted testosterone only among men with lower cortisol levels. This research offers empirical evidence of possible evolutionary functions for attention to men's vocal characteristics in contexts of sexual selection.}, }
@article {pmid33268219, year = {2022}, author = {Leung, Y and Oates, J and Papp, V and Chan, SP}, title = {Formant Frequencies of Adult Speakers of Australian English and Effects of Sex, Age, Geographical Location, and Vowel Quality.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {36}, number = {6}, pages = {875.e1-875.e13}, doi = {10.1016/j.jvoice.2020.09.026}, pmid = {33268219}, issn = {1873-4588}, mesh = {Humans ; Adult ; Male ; Female ; Adolescent ; Young Adult ; Middle Aged ; *Speech Acoustics ; *Phonetics ; Australia ; Reading ; Linear Models ; }, abstract = {AIMS: The primary aim of this study was to provide normative formant frequency (F) values for male and female speakers of Australian English. The secondary aim was to examine the effects of speaker sex, age, vowel quality, and geographical location on F.
METHOD: The first three monophthong formant frequencies (F1, F2, and F3) for 244 female and 135 male speakers aged 18-60 years from a recent large-scale corpus of Australian English were analysed on a passage reading task.
RESULTS: Mixed effects linear regression models suggested that speaker sex, speaker age, and vowel quality significantly predicted F1, F2, and F3 (P < 0.001). Effect sizes suggested that speaker sex and vowel quality contributed most to the variations in F1, F2, and F3 whereas speaker age and geographical location contributed a smaller amount.
CONCLUSION: Both clinicians and researchers are provided with normative F data for 18-60 year-old speakers of Australian English. Such data have increased internal and external validity relative to previous literature. F normative data for speakers of Australian English should be considered with reference to speaker sex and vowel but it may not be practically necessary to adjust for speaker age and geographical location.}, }
@article {pmid33261411, year = {2020}, author = {Tabain, M and Kochetov, A and Beare, R}, title = {An ultrasound and formant study of manner contrasts at four coronal places of articulation.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {5}, pages = {3195}, doi = {10.1121/10.0002486}, pmid = {33261411}, issn = {1520-8524}, mesh = {Acoustics ; Australia ; Language ; *Phonetics ; *Speech Acoustics ; }, abstract = {This study examines consonant manner of articulation at four coronal places of articulation, using ultrasound and formant analyses of the Australian language Arrernte. Stop, nasal, and lateral articulations are examined at the dental, alveolar, retroflex, and alveo-palatal places of articulation: /t̪ n̪ l̪/ vs /t n l/ vs /ʈ ɳ ɭ/ vs /c ɲ ʎ/. Ultrasound data clearly show a more retracted tongue root for the lateral, and a more advanced tongue root for the nasal, as compared to the stop. However, the magnitude of the differences is much greater for the stop∼lateral contrast than for the stop∼nasal contrast. Acoustic results show clear effects on F1 in the adjacent vowels, in particular the preceding vowel, with F1 lower adjacent to nasals and higher adjacent to laterals, as compared to stops. Correlations between the articulatory and acoustic data are particularly strong for this formant. However, the retroflex place of articulation shows effects according to manner for higher formants as well, suggesting that a better understanding of retroflex acoustics for different manners of articulation is required. The study also suggests that articulatory symmetry and gestural economy are affected by the size of the phonemic inventory.}, }
@article {pmid33261400, year = {2020}, author = {Vampola, T and Horáček, J and Radolf, V and Švec, JG and Laukkanen, AM}, title = {Influence of nasal cavities on voice quality: Computer simulations and experiments.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {5}, pages = {3218}, doi = {10.1121/10.0002487}, pmid = {33261400}, issn = {1520-8524}, mesh = {Computer Simulation ; Female ; Humans ; Nasal Cavity/diagnostic imaging ; Phonetics ; Speech Acoustics ; *Voice ; *Voice Quality ; }, abstract = {Nasal cavities are known to introduce antiresonances (dips) in the sound spectrum reducing the acoustic power of the voice. In this study, a three-dimensional (3D) finite element (FE) model of the vocal tract (VT) of one female subject was created for vowels [a:] and [i:] without and with a detailed model of nasal cavities based on CT (Computer Tomography) images. The 3D FE models were then used for analyzing the resonances, antiresonances and the acoustic pressure response spectra of the VT. The computed results were compared with the measurements of a VT model for the vowel [a:], obtained from the FE model by 3D printing. The nasality affects mainly the lowest formant frequency and decreases its peak level. The results confirm the main effect of nasalization, i.e., that sound pressure level decreases in the frequency region of the formants F1-F2 and emphasizes the frequency region of the formants F3-F5 around the singer's formant cluster. Additionally, many internal local resonances in the nasal and paranasal cavities were found in the 3D FE model. Their effect on the acoustic output was found to be minimal, but accelerometer measurements on the walls of the 3D-printed model suggested they could contribute to structure vibrations.}, }
@article {pmid33202192, year = {2021}, author = {Pépiot, E and Arnold, A}, title = {Cross-Gender Differences in English/French Bilingual Speakers: A Multiparametric Study.}, journal = {Perceptual and motor skills}, volume = {128}, number = {1}, pages = {153-177}, doi = {10.1177/0031512520973514}, pmid = {33202192}, issn = {1558-688X}, mesh = {Female ; Humans ; *Language ; Male ; Phonetics ; Sex Characteristics ; Speech ; *Voice ; }, abstract = {The present study concerns speech productions of female and male English/French bilingual speakers in both reading and semi-spontaneous speech tasks. We investigated various acoustic parameters: average fundamental sound frequency (F0), F0 range, F0 variance (SD), vowel formants (F1, F2, and F3), voice onset time (VOT) and H1-H2 (intensity difference between the first and the second harmonic frequencies, used to measure phonation type) in both languages. Our results revealed a significant effect of gender and language on all parameters. Overall, average F0 was higher in French while F0 modulation was stronger in English. Regardless of language, female speakers exhibited higher F0 than male speakers. Moreover, the cross-language difference in average F0 (higher in French) was larger in female speakers. On the other hand, the cross-language reduction in F0 modulation in French was stronger in male speakers. The analysis of vowel formants showed that overall, female speakers exhibited higher values than males. However, we found a significant cross-gender difference on F2 of the back vowel [u:] in English, but not on the vowel [u] in French. VOT of voiceless stops was longer in female speakers in both languages, with a greater difference in English. VOT contrast between voiceless stops and their voiced counterparts was also significantly longer in female speakers in both languages. The scope of this cross-gender difference was greater in English. H1-H2 was higher in female speakers in both languages, indicating a breathier phonation type. Furthermore, female speakers tended to exhibit smaller H1-H2 in French, while the opposite was true in males. This resulted in a smaller cross-gender difference in French for this parameter. All these data support the idea of language- and gender-specific vocal norms, to which bilingual speakers seem to adapt. This constitutes a further argument to give social factors, such as gender dynamics, more consideration in phonetic studies.}, }
@article {pmid33166974, year = {2020}, author = {Hînganu, D and Hînganu, MV}, title = {Hidden Anatomy of Opera Singers.}, journal = {Advances in oto-rhino-laryngology}, volume = {85}, number = {}, pages = {158-169}, doi = {10.1159/000490014}, pmid = {33166974}, issn = {1662-2847}, mesh = {Glottis/*anatomy & histology/physiology ; Humans ; Oropharynx/*anatomy & histology/physiology ; Singing/*physiology ; Voice Quality/*physiology ; }, abstract = {The history of research on the voice of opera soloists shows that there are certain functional features of the cranial nerves and cortical nerve centers. In this chapter, we review the most important findings in the field of canto voice neuroanatomy, which we corroborate with the results of our team's research and experience. Our study focuses on the nerve structures involved in phonation at each level of the vocal formants: infraglottic, glottic, and oropharyngeal. We consider this research to have direct applicability in the fields of neurolaryngology, neuroscience, and phoniatry, but also in academic teaching. At the same time, the present study is a starting point for future research works on the anatomical and functional particularities of the structures involved during the act of phonation in canto soloists.}, }
@article {pmid33143999, year = {2023}, author = {Ishikawa, K and Webster, J}, title = {The Formant Bandwidth as a Measure of Vowel Intelligibility in Dysphonic Speech.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {2}, pages = {173-177}, doi = {10.1016/j.jvoice.2020.10.012}, pmid = {33143999}, issn = {1873-4588}, mesh = {Adult ; Female ; Humans ; *Dysphonia ; Speech Intelligibility ; Speech Acoustics ; *Voice ; Acoustics ; Phonetics ; }, abstract = {OBJECTIVE: The current paper examined the impact of dysphonia on the bandwidth of the first two formants of vowels, and the relationship between the formant bandwidth and vowel intelligibility.
METHODS: Speaker participants of the study were 10 adult females with healthy voice and 10 adult females with dysphonic voice. Eleven vowels in American English were recorded in /h/-vowel-/d/ format. The vowels were presented to 10 native speakers of American English with normal hearing, who were asked to select a vowel they heard from a list of /h/-vowel-/d/ words. The vowels were acoustically analyzed to measure the bandwidth of the first and second formants (B1 and B2). Separate Wilcoxon rank sum tests were conducted for each vowel for normal and dysphonic speech because the differences in B1 and B2 were found to not be normally distributed. Spearman correlation tests were conducted to evaluate the association between the difference in formant bandwidths and vowel intelligibility between the healthy and dysphonic speakers.
RESULTS: B1 was significantly greater in dysphonic vowels for seven of the eleven vowels and smaller for only one of the vowels. There was no statistically significant difference in B2 between the normal and dysphonic vowels, except for the vowel /i/. The difference in B1 between normal and dysphonic vowels strongly predicted the intelligibility difference.
CONCLUSION: Dysphonia significantly affects B1, and the difference in B1 may serve as an acoustic marker for the intelligibility reduction in dysphonic vowels. This acoustic-perceptual relationship should be confirmed by a larger-scale study in the future.}, }
@article {pmid33143998, year = {2023}, author = {Burckardt, ES and Hillman, RE and Murton, O and Mehta, D and Van Stan, J and Burns, JA}, title = {The Impact of Tonsillectomy on the Adult Singing Voice: Acoustic and Aerodynamic Measures.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {37}, number = {1}, pages = {101-104}, doi = {10.1016/j.jvoice.2020.09.029}, pmid = {33143998}, issn = {1873-4588}, mesh = {Adult ; Humans ; *Singing ; *Tonsillectomy/adverse effects ; Prospective Studies ; Quality of Life ; Voice Quality ; Acoustics ; }, abstract = {OBJECTIVE: Singers undergoing tonsillectomy are understandably concerned about possible sequelae to their voice. The surgical risks of laryngeal damage from intubation and upper airway scarring are valid reasons for singers to carefully consider their options for treatment of tonsil-related symptoms. No prior studies have statistically assessed objective voice outcomes in a group of adult singers undergoing tonsillectomy. This study determined the impact of tonsillectomy on the adult singing voice by determining if there were statistically significant changes in preoperative versus postoperative acoustic, aerodynamic, and Voice-Related Quality of Life (VRQOL) measures.
STUDY DESIGN: Prospective cohort study.
SETTING: Tertiary referral academic hospital. SUBJECTS: Thirty singers undergoing tonsillectomy from 2012 to 2019.
METHODS: Acoustic recordings were obtained with the Computerized Speech Lab (CSL) (Pentax CSL 4500) and analyzed with the Multidimensional Voice Program (MDVP) (Pentax MDVP) and Praat acoustic analysis software. Estimates of aerodynamic vocal efficiency were obtained and analyzed using the Phonatory Aerodynamic System (Pentax PAS 6600). Preoperative VRQOL scores were recorded, and singers were instructed to refrain from singing for 3 weeks following tonsillectomy. Repeat acoustic and aerodynamic measures as well as VRQOL scores were obtained at the first postoperative visit.
RESULTS: Average postoperative acoustic (jitter, shimmer, HNR) and aerodynamic (sound pressure level divided by subglottal pressure) parameters related to laryngeal phonatory function did not differ significantly from preoperative measures. The only statistically significant change in postoperative measures of resonance was a decrease in the 3rd formant (F3) for the /a/ vowel. Average postoperative VRQOL scores (89, SD 12.2) improved significantly from preoperative VRQOL scores (79.8, SD 18.7) (P = 0.007).
CONCLUSIONS: Tonsillectomy does not appear to alter laryngeal voice production in adult singers as measured by standard acoustic and aerodynamic parameters. The observed decrease in F3 for the /a/ vowel is hypothetically related to increasing the pharyngeal cross-sectional area by removing tonsillar tissue, but this would not be expected to appreciably impact the perceptual characteristics of the vowel. Singers' self-assessment (VRQOL) improved after tonsillectomy.}, }
@article {pmid33138537, year = {2020}, author = {Roberts, B and Summers, RJ}, title = {Informational masking of speech depends on masker spectro-temporal variation but not on its coherence.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {4}, pages = {2416}, doi = {10.1121/10.0002359}, pmid = {33138537}, issn = {1520-8524}, mesh = {Humans ; *Perceptual Masking ; Recognition, Psychology ; *Speech Intelligibility ; *Speech Perception ; }, abstract = {The impact of an extraneous formant on intelligibility is affected by the extent (depth) of variation in its formant-frequency contour. Two experiments explored whether this impact also depends on masker spectro-temporal coherence, using a method ensuring that interference occurred only through informational masking. Targets were monaural three-formant analogues (F1+F2+F3) of natural sentences presented alone or accompanied by a contralateral competitor for F2 (F2C) that listeners must reject to optimize recognition. The standard F2C was created using the inverted F2 frequency contour and constant amplitude. Variants were derived by dividing F2C into abutting segments (100-200 ms, 10-ms rise/fall). Segments were presented either in the correct order (coherent) or in random order (incoherent), introducing abrupt discontinuities into the F2C frequency contour. F2C depth was also manipulated (0%, 50%, or 100%) prior to segmentation, and the frequency contour of each segment either remained time-varying or was set to constant at the geometric mean frequency of that segment. The extent to which F2C lowered keyword scores depended on segment type (frequency-varying vs constant) and depth, but not segment order. This outcome indicates that the impact on intelligibility depends critically on the overall amount of frequency variation in the competitor, but not its spectro-temporal coherence.}, }
@article {pmid33138491, year = {2020}, author = {Nenadić, F and Coulter, P and Nearey, TM and Kiefte, M}, title = {Perception of vowels with missing formant peaks.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {4}, pages = {1911}, doi = {10.1121/10.0002110}, pmid = {33138491}, issn = {1520-8524}, mesh = {Cues ; Humans ; Language ; *Phonetics ; *Speech Perception ; }, abstract = {Although the first two or three formant frequencies are considered essential cues for vowel identification, certain limitations of this approach have been noted. Alternative explanations have suggested listeners rely on other aspects of the gross spectral shape. A study conducted by Ito, Tsuchida, and Yano [(2001). J. Acoust. Soc. Am. 110, 1141-1149] offered strong support for the latter, as attenuation of individual formant peaks left vowel identification largely unaffected. In the present study, these experiments are replicated in two dialects of English. Although the results were similar to those of Ito, Tsuchida, and Yano [(2001). J. Acoust. Soc. Am. 110, 1141-1149], quantitative analyses showed that when a formant is suppressed, participant response entropy increases due to increased listener uncertainty. In a subsequent experiment, using synthesized vowels with changing formant frequencies, suppressing individual formant peaks led to reliable changes in identification of certain vowels but not in others. These findings indicate that listeners can identify vowels with missing formant peaks. However, such formant-peak suppression may lead to decreased certainty in identification of steady-state vowels or even changes in vowel identification in certain dynamically specified vowels.}, }
@article {pmid33136646, year = {2020}, author = {Easwar, V and Birstler, J and Harrison, A and Scollie, S and Purcell, D}, title = {The Accuracy of Envelope Following Responses in Predicting Speech Audibility.}, journal = {Ear and hearing}, volume = {41}, number = {6}, pages = {1732-1746}, pmid = {33136646}, issn = {1538-4667}, support = {UL1 TR002373/TR/NCATS NIH HHS/United States ; /CAPMC/CIHR/Canada ; }, mesh = {Hearing Tests ; Humans ; Male ; *Speech ; Speech Acoustics ; *Speech Perception ; }, abstract = {OBJECTIVES: The present study aimed to (1) evaluate the accuracy of envelope following responses (EFRs) in predicting speech audibility as a function of the statistical indicator used for objective response detection, stimulus phoneme, frequency, and level, and (2) quantify the minimum sensation level (SL; stimulus level above behavioral threshold) needed for detecting EFRs.
DESIGN: In 21 participants with normal hearing, EFRs were elicited by 8 band-limited phonemes in the male-spoken token /susa∫i/ (2.05 sec) presented between 20 and 65 dB SPL in 15 dB increments. Vowels in /susa∫i/ were modified to elicit two EFRs simultaneously by selectively lowering the fundamental frequency (f0) in the first formant (F1) region. The modified vowels elicited one EFR from the low-frequency F1 and another from the mid-frequency second and higher formants (F2+). Fricatives were amplitude-modulated at the average f0. EFRs were extracted from single-channel EEG recorded between the vertex (Cz) and the nape of the neck when /susa∫i/ was presented monaurally for 450 sweeps. The performance of the three statistical indicators, F-test, Hotelling's T², and phase coherence, was compared against behaviorally determined audibility (estimated SL, SL ≥0 dB = audible) using area under the receiver operating characteristics (AUROC) curve, sensitivity (the proportion of audible speech with a detectable EFR [true positive rate]), and specificity (the proportion of inaudible speech with an undetectable EFR [true negative rate]). The influence of stimulus phoneme, frequency, and level on the accuracy of EFRs in predicting speech audibility was assessed by comparing sensitivity, specificity, positive predictive value (PPV; the proportion of detected EFRs elicited by audible stimuli) and negative predictive value (NPV; the proportion of undetected EFRs elicited by inaudible stimuli). The minimum SL needed for detection was evaluated using a linear mixed-effects model with the predictor variables stimulus and EFR detection p value.
RESULTS: AUROCs of the 3 statistical indicators were similar; however, at the type I error rate of 5%, the sensitivities of Hotelling's T² (68.4%) and phase coherence (68.8%) were significantly higher than that of the F-test (59.5%). In contrast, the specificity of the F-test (97.3%) was significantly higher than that of Hotelling's T² (88.4%). When analyzed using Hotelling's T² as a function of stimulus, fricatives offered higher sensitivity (88.6 to 90.6%) and NPV (57.9 to 76.0%) compared with most vowel stimuli (51.9 to 71.4% and 11.6 to 51.3%, respectively). When analyzed as a function of frequency band (F1, F2+, and fricatives aggregated as low-, mid- and high-frequencies, respectively), high-frequency stimuli offered the highest sensitivity (96.9%) and NPV (88.9%). When analyzed as a function of test level, sensitivity improved with increases in stimulus level (99.4% at 65 dB SPL). The minimum SL for EFR detection ranged between 13.4 and 21.7 dB for F1 stimuli, 7.8 to 12.2 dB for F2+ stimuli, and 2.3 to 3.9 dB for fricative stimuli.
CONCLUSIONS: EFR-based inference of speech audibility requires consideration of the statistical indicator used, phoneme, stimulus frequency, and stimulus level.}, }
@article {pmid33123625, year = {2019}, author = {Rakerd, B and Hunter, EJ and Lapine, P}, title = {Resonance Effects and the Vocalization of Speech.}, journal = {Perspectives of the ASHA special interest groups}, volume = {4}, number = {6}, pages = {1637-1643}, pmid = {33123625}, issn = {2381-4764}, support = {R01 DC012315/DC/NIDCD NIH HHS/United States ; }, abstract = {Studies of the respiratory and laryngeal actions required for phonation are central to our understanding of both voice and voice disorders. The purpose of the present article is to highlight complementary insights about voice that have come from the study of vocal tract resonance effects.}, }
@article {pmid33121295, year = {2022}, author = {Jeanneteau, M and Hanna, N and Almeida, A and Smith, J and Wolfe, J}, title = {Using visual feedback to tune the second vocal tract resonance for singing in the high soprano range.}, journal = {Logopedics, phoniatrics, vocology}, volume = {47}, number = {1}, pages = {25-34}, doi = {10.1080/14015439.2020.1834612}, pmid = {33121295}, issn = {1651-2022}, mesh = {Feedback, Sensory ; Female ; Humans ; *Singing ; Vibration ; *Voice ; Voice Quality ; }, abstract = {PURPOSE: Over a range roughly C5-C6, sopranos usually tune their first vocal tract resonance (R1) to the fundamental frequency (fo) of the note sung: R1:fo tuning. Those who sing well above C6 usually adjust their second vocal tract resonance (R2) and use R2:fo tuning. This study investigated these questions: Can singers quickly learn R2:fo tuning when given suitable feedback? Can they subsequently use this tuning without feedback? And finally, if so, does this assist their singing in the high range?
METHODS: New computer software for the technique of resonance estimation by broadband excitation at the lips was used to provide real-time visual feedback on fo and vocal tract resonances. Eight sopranos participated. In a one-hour session, they practised adjusting R2 whilst miming (i.e. without phonating), and then during singing.
RESULTS: Six sopranos learned to tune R2 over a range of several semi-tones, when feedback was present. This achievement did not immediately extend their singing range. When the feedback was removed, two sopranos spontaneously used R2:fo tuning at the top of their range above C6.
CONCLUSIONS: With only one hour of training, singers can learn to adjust their vocal tract shape for R2:fo tuning when provided with visual feedback. One additional participant, who spent considerable time with the software, acquired greater skill at R2:fo tuning and was able to extend her singing range. A simple version of the hardware used can be assembled from basic equipment, and the software is available online.
@article {pmid33106062, year = {2022}, author = {Ayres, A and Winckler, PB and Jacinto-Scudeiro, LA and Rech, RS and Padovani, MMP and Jotz, GP and Olchik, MR}, title = {Speech characteristics in individuals with myasthenia gravis: a case control study.}, journal = {Logopedics, phoniatrics, vocology}, volume = {47}, number = {1}, pages = {35-42}, doi = {10.1080/14015439.2020.1834614}, pmid = {33106062}, issn = {1651-2022}, mesh = {Case-Control Studies ; *Dysarthria/diagnosis/etiology ; Female ; Humans ; Male ; Middle Aged ; *Myasthenia Gravis/complications/diagnosis ; Speech ; Speech Acoustics ; Voice Quality ; }, abstract = {INTRODUCTION: Myasthenia Gravis (MG) is an autoimmune disease. The characteristic symptoms of the disease are muscle weakness and fatigue. These symptoms affect de oral muscles causing dysarthria, affecting about 60% of patients with disease progression.
PURPOSE: To describe the speech pattern of patients with MG and compare it with that of healthy controls (HC).
MATERIAL AND METHODS: Case-control study. Participants were divided into an MG group (MGG) of 38 patients with diagnosed MG and an HC group of 18 individuals matched for age and sex. The MGG was evaluated with clinical and motor scales and answered self-perception questionnaires. Speech assessment of both groups included recording of speech tasks and acoustic and auditory-perceptual analyses.
RESULTS: In the MGG, 68.24% of the patients were female, with an average age of 50.21 years (±16.47), a disease duration of 14.18 years (±9.52), and a motor scale score of 11.19 points (±8.79). The auditory-perceptual analysis showed that 47.36% (n = 18) of MGG participants presented with mild dysarthria and 10.52% (n = 4) with moderate dysarthria, with a high percentage of alterations in phonation (95.2%) and breathing (52.63%). The acoustic analysis showed a change in phonation, with significantly higher shimmer values in the MGG compared to the HC, and a change in articulation, with a significant difference between the groups for the first formant of /iu/ (P < .001). No correlation was found between the diagnosis of speech disorder and the dysarthria self-perception questionnaire.
CONCLUSION: We found mild dysarthria in MG patients, with changes in the motor bases of phonation and breathing, and no correlation with disease severity or duration.}, }
@article {pmid33091464, year = {2020}, author = {Kim, KS and Daliri, A and Flanagan, JR and Max, L}, title = {Dissociated Development of Speech and Limb Sensorimotor Learning in Stuttering: Speech Auditory-motor Learning is Impaired in Both Children and Adults Who Stutter.}, journal = {Neuroscience}, volume = {451}, number = {}, pages = {1-21}, pmid = {33091464}, issn = {1873-7544}, support = {R01 DC007603/DC/NIDCD NIH HHS/United States ; R01 DC014510/DC/NIDCD NIH HHS/United States ; R01 DC017444/DC/NIDCD NIH HHS/United States ; MOP-137001//CIHR/Canada ; }, mesh = {Adaptation, Physiological ; Adolescent ; Adult ; Child ; Child, Preschool ; Feedback, Sensory ; Humans ; Learning ; *Speech ; *Stuttering ; }, abstract = {Stuttering is a neurodevelopmental disorder of speech fluency. Various experimental paradigms have demonstrated that affected individuals show limitations in sensorimotor control and learning. However, controversy exists regarding two core aspects of this perspective. First, it has been claimed that sensorimotor learning limitations are detectable only in adults who stutter (after years of coping with the disorder) but not during childhood close to the onset of stuttering. Second, it remains unclear whether stuttering individuals' sensorimotor learning limitations affect only speech movements or also unrelated effector systems involved in nonspeech movements. We report data from separate experiments investigating speech auditory-motor learning (N = 60) and limb visuomotor learning (N = 84) in both children and adults who stutter versus matched nonstuttering individuals. Both children and adults who stutter showed statistically significant limitations in speech auditory-motor adaptation with formant-shifted feedback. This limitation was more profound in children than in adults and in younger children versus older children. Between-group differences in the adaptation of reach movements performed with rotated visual feedback were subtle but statistically significant for adults. In children, even the nonstuttering groups showed limited visuomotor adaptation just like their stuttering peers. We conclude that sensorimotor learning is impaired in individuals who stutter, and that the ability for speech auditory-motor learning-which was already adult-like in 3-6 year-old typically developing children-is severely compromised in young children near the onset of stuttering. Thus, motor learning limitations may play an important role in the fundamental mechanisms contributing to the onset of this speech disorder.}, }
@article {pmid33079610, year = {2020}, author = {Lester-Smith, RA and Daliri, A and Enos, N and Abur, D and Lupiani, AA and Letcher, S and Stepp, CE}, title = {The Relation of Articulatory and Vocal Auditory-Motor Control in Typical Speakers.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {11}, pages = {3628-3642}, pmid = {33079610}, issn = {1558-9102}, support = {P50 DC015446/DC/NIDCD NIH HHS/United States ; R01 DC015570/DC/NIDCD NIH HHS/United States ; R01 DC016270/DC/NIDCD NIH HHS/United States ; R21 DC017001/DC/NIDCD NIH HHS/United States ; }, mesh = {Auditory Perception ; Feedback, Sensory ; Female ; Humans ; *Voice ; }, abstract = {Purpose The purpose of this study was to explore the relationship between feedback and feedforward control of articulation and voice by measuring reflexive and adaptive responses to first formant (F1) and fundamental frequency (fo) perturbations. In addition, perception of F1 and fo perturbation was estimated using passive (listening) and active (speaking) just noticeable difference paradigms to assess the relation of auditory acuity to reflexive and adaptive responses. Method Twenty healthy women produced single words and sustained vowels while the F1 or fo of their auditory feedback was suddenly and unpredictably perturbed to assess reflexive responses or gradually and predictably perturbed to assess adaptive responses. Results Typical speakers' reflexive responses to sudden perturbation of F1 were related to their adaptive responses to gradual perturbation of F1. Specifically, speakers with larger reflexive responses to sudden perturbation of F1 had larger adaptive responses to gradual perturbation of F1. Furthermore, their reflexive responses to sudden perturbation of F1 were associated with their passive auditory acuity to F1 such that speakers with better auditory acuity to F1 produced larger reflexive responses to sudden perturbations of F1. Typical speakers' adaptive responses to gradual perturbation of F1 were not associated with their auditory acuity to F1. Speakers' reflexive and adaptive responses to perturbation of fo were not related, nor were their responses related to either measure of auditory acuity to fo. Conclusion These findings indicate that there may be disparate feedback and feedforward control mechanisms for articulatory and vocal error correction based on auditory feedback.}, }
@article {pmid33069508, year = {2022}, author = {Pawelec, ŁP and Graja, K and Lipowicz, A}, title = {Vocal Indicators of Size, Shape and Body Composition in Polish Men.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {36}, number = {6}, pages = {878.e9-878.e22}, doi = {10.1016/j.jvoice.2020.09.011}, pmid = {33069508}, issn = {1873-4588}, mesh = {Humans ; Adult ; Male ; Poland ; *Voice ; Voice Quality ; *Larynx ; Body Composition ; Speech Acoustics ; }, abstract = {OBJECTIVES: From a human evolution perspective, identifying a link between physique and vocal quality could demonstrate dual signaling in terms of the health and biological condition of an individual. In this regard, this study investigates the relationship between men's body size, shape, and composition, and their vocal characteristics.
MATERIALS AND METHODS: Eleven anthropometric measurements, using seven indices, were carried out with 80 adult Polish male participants, while the speech analysis adopted a voice recording procedure that involved phonetically recording vowels /ɑː/, /ɛː/, /iː/, /ɔː/, /uː/ to define the voice acoustic components used in Praat software.
RESULTS: Relationships between voice parameters and body size, shape, and composition were found. The analysis indicated that the formants and their derivatives were useful parameters for predicting height, weight, and neck, shoulder, waist, and hip circumferences. Fundamental frequency (F0) was negatively correlated with neck circumference at Adam's apple level and with body height. Moreover, the association between neck circumference and F0 was observed here for the first time. The association between waist circumference and the formant components showed a net effect. In addition, the formant parameters showed significant correlations with body shape, indicating a lower vocal timbre in men with a larger relative waist circumference.
DISCUSSION: Men with lower vocal pitch had wider necks, probably a result of larynx size. Furthermore, a greater waist circumference, presumably resulting from abdominal fat distribution in men, correlated with a lower vocal timbre. While these results are inconclusive, they highlight new directions for further research.}, }
@article {pmid33029845, year = {2020}, author = {Auracher, J and Menninghaus, W and Scharinger, M}, title = {Sound Predicts Meaning: Cross-Modal Associations Between Formant Frequency and Emotional Tone in Stanzas.}, journal = {Cognitive science}, volume = {44}, number = {10}, pages = {e12906}, doi = {10.1111/cogs.12906}, pmid = {33029845}, issn = {1551-6709}, mesh = {Acoustics ; Adult ; *Comprehension ; *Emotions ; Female ; Humans ; *Language ; Male ; Phonetics ; *Sound ; }, abstract = {Research on the relation between sound and meaning in language has reported substantial evidence for implicit associations between articulatory-acoustic characteristics of phonemes and emotions. In the present study, we specifically tested the relation between the acoustic properties of a text and its emotional tone as perceived by readers. To this end, we asked participants to assess the emotional tone of single stanzas extracted from a large variety of poems. The selected stanzas had either an extremely high, a neutral, or an extremely low average formant dispersion. To assess the average formant dispersion per stanza, all words were phonetically transcribed and the distance between the first and second formant per vowel was calculated. Building on a long tradition of research on associations between sound frequency on the one hand and non-acoustic concepts such as size, strength, or happiness on the other hand, we hypothesized that stanzas with an extremely high average formant dispersion would be rated lower on items referring to Potency (dominance) and higher on items referring to Activity (arousal) and Evaluation (emotional valence). The results confirmed our hypotheses for the dimensions of Potency and Evaluation, but not for the dimension of Activity. We conclude that, at least in poetic language, extreme values of acoustic features of vowels are a significant predictor for the emotional tone of a text.}, }
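[Editor's note] The stanza-level measure described in the abstract above (the F1-F2 distance per vowel, averaged over all vowels in a stanza) reduces to a few lines of code. A minimal sketch, assuming formant values in Hz have already been obtained from the phonetic transcription via a formant lookup table; the function name and example values are illustrative, not from the paper:

```python
# Average formant dispersion per stanza: the mean F2 - F1 distance
# across all vowel tokens, as described in the abstract above.
def stanza_formant_dispersion(vowel_formants):
    """vowel_formants: list of (F1, F2) tuples in Hz, one per vowel token."""
    if not vowel_formants:
        raise ValueError("stanza contains no vowels")
    return sum(f2 - f1 for f1, f2 in vowel_formants) / len(vowel_formants)

# Example: a stanza whose vowel tokens are /a/, /i/, /u/ (typical male values)
print(stanza_formant_dispersion([(730, 1090), (270, 2290), (300, 870)]))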
@article {pmid33012680, year = {2020}, author = {Song, XY and Wang, SJ and Xu, ZX and Hao, YM and Feng, L and Ding, XD and Gao, H and Wang, YQ}, title = {Preliminary study on phonetic characteristics of patients with pulmonary nodules.}, journal = {Journal of integrative medicine}, volume = {18}, number = {6}, pages = {499-504}, doi = {10.1016/j.joim.2020.09.004}, pmid = {33012680}, issn = {2095-4964}, mesh = {Case-Control Studies ; China ; Early Detection of Cancer ; Humans ; *Lung Neoplasms/complications ; *Phonetics ; *Voice Quality ; }, abstract = {OBJECTIVE: Pulmonary nodules (PNs) are one of the imaging manifestations of early lung cancer screening, which should receive more attention. Traditional Chinese medicine believes that voice changes occur in patients with pulmonary diseases. The purpose of this study is to explore the differences in phonetic characteristics between patients with PNs and able-bodied persons.
METHODS: This case-control study explores the phonetic characteristics of patients with PNs in order to provide a simpler and cheaper method for PN screening. Non-parametric statistics were performed on acoustic parameters of vocalizations collected from the two groups in Shanghai, China, between January 2017 and March 2018, including differences in the third and fourth formant parameters between patients with PNs and the normal control group. At the same time, computed tomography (CT) scans, course of disease, comorbidities and other risk factors of the patients were collected in the form of a questionnaire. According to the grouping of risk factors, the phonetic characteristics of the patients with PNs were analyzed.
RESULTS: This study comprised 200 patients with PNs, as confirmed by CT, and 86 healthy people who served as a control group. Among patients with PNs, 43% had ground glass opacity, 32% had nodules with a diameter ≥ 8 mm, 19% had a history of smoking and 31% had hyperlipidemia. Compared with the normal group, there were statistically significant differences in pitch, intensity and shimmer in patients with PNs. Among patients with PNs, those with nodule diameters ≥ 8 mm had a significantly higher third formant. There was a significant difference in intensity, fourth formant and harmonics-to-noise ratio (HNR) between smoking and non-smoking patients. Compared with non-hyperlipidemia patients, the pitch, jitter and shimmer of patients with PNs and hyperlipidemia were higher and the HNR was lower; these differences were statistically significant.
CONCLUSION: Measurable changes in vocalizations can be detected in patients with PNs. Patients with PNs had lower and weaker voices. The size of the PNs had an effect on the phonetic formants. Smoking may contribute to voice damage and formant changes. Voice damage is more pronounced in individuals who have PNs accompanied by hyperlipidemia.}, }
@article {pmid33008725, year = {2022}, author = {Melton, J and Bradford, Z and Lee, J}, title = {Acoustic Characteristics of Vocal Sounds Used by Professional Actors Performing Classical Material Without Microphones in Outdoor Theatre.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {36}, number = {5}, pages = {733.e23-733.e29}, doi = {10.1016/j.jvoice.2020.08.036}, pmid = {33008725}, issn = {1873-4588}, mesh = {Acoustics ; Female ; Humans ; Male ; Occupations ; *Speech Acoustics ; *Voice ; Voice Quality ; }, abstract = {OBJECTIVE: Theatre actors use voice in virtually any physical position, moving or still, and perform in a wide range of venues. The present study investigated acoustic qualities required to perform classical material without electronic amplification in outdoor spaces.
DESIGN: Eight professional actors, four female, four male, from NY Classical Theatre performed one-minute monologues, first stationary, then moving, for audio recording in Central Park. Four subjects recorded two monologues each, from productions in which they played both male and female characters. Data were analyzed for fundamental frequency (F0), sound pressure level (SPL), and long-term average spectrum (LTAS).
RESULTS: Overall, F0 ranged between 75.38 and 530.33 Hz. Average F0 was 326 Hz stationary and 335.78 Hz moving for females, 248.54 Hz stationary, 252.82 Hz moving for males. SPL ranged from 28.54 to 110.51 dB for females, and 56.69 to 124.44 dB for males. Average SPL was 82 dB for females, 96.98 dB for males. On LTAS, females had a peak between 3 and 4 kHz ranging from 1.5 to 4.5 dB and another between 4 and 5 kHz ranging from 2 to 4.5 dB, while males had a peak between 3 and 4 kHz ranging from 1 to 8.5 dB.
CONCLUSION: Actors appear to use a similar F0 range across gender and performing conditions. Average F0 increased from stationary to moving. Males had greater SPL values than females, and the amplitude of peaks in the region of the Actor's Formant of LTAS curves was higher in male than female voices.}, }
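[Editor's note] The LTAS peaks reported in the entry above can be approximated from a recording with Welch's method. A minimal sketch; the 3-4 kHz band edges and the 0-8 kHz reference band are assumptions for illustration, not the authors' published procedure:

```python
# Level (dB) of the 3-4 kHz band (the region of the Actor's Formant)
# relative to the 0-8 kHz mean of the long-term average spectrum.
import numpy as np
from scipy.signal import welch

def actors_formant_prominence(audio, fs):
    freqs, psd = welch(audio, fs=fs, nperseg=4096)
    band = (freqs >= 3000) & (freqs < 4000)
    ref = freqs < 8000
    return 10 * np.log10(psd[band].mean() / psd[ref].mean())

fs = 16000
audio = np.random.randn(fs * 10)  # synthetic stand-in for a recorded monologue
print(actors_formant_prominence(audio, fs))
```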
@article {pmid33003843, year = {2020}, author = {Caverlé, MWJ and Vogel, AP}, title = {Stability, reliability, and sensitivity of acoustic measures of vowel space: A comparison of vowel space area, formant centralization ratio, and vowel articulation index.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {3}, pages = {1436}, doi = {10.1121/10.0001931}, pmid = {33003843}, issn = {1520-8524}, mesh = {*Acoustics ; Phonetics ; Reproducibility of Results ; *Speech Acoustics ; }, abstract = {Vowel space (VS) measurements can provide objective information on formant distribution and act as a proxy for vowel production. There are a number of proposed ways to quantify vowel production clinically, including vowel space area, formant centralization ratio, and vowel articulation index (VAI). The stability, reliability, and sensitivity of three VS measurements were investigated in two experiments. Stability was explored across three inter-recording intervals and challenged in two sensitivity conditions. Data suggest that VAI is the most stable measure across 30 s, 2 h, and 4 h inter-recording intervals. VAI appears the most sensitive metric of the three measures in conditions of fatigue and noise. These analyses highlight the need for stability and sensitivity analysis when developing and validating acoustic metrics, and underscore the potential of the VAI for vowel analysis.}, }
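[Editor's note] For reference, the three metrics compared above are commonly computed from corner-vowel formants as follows. This sketch uses the widely cited formulations (e.g., Sapir et al., 2010), which may differ in detail from the paper's implementation; the formant values are illustrative:

```python
# Vowel space area (VSA), vowel articulation index (VAI), and formant
# centralization ratio (FCR) from corner-vowel (F1, F2) pairs in Hz.

def vowel_space_area(corners):
    """Shoelace area of the polygon through the (F1, F2) corner vowels, Hz^2."""
    area = 0.0
    n = len(corners)
    for k in range(n):
        f1a, f2a = corners[k]
        f1b, f2b = corners[(k + 1) % n]
        area += f1a * f2b - f1b * f2a
    return abs(area) / 2.0

def vai(i, u, a):
    """VAI = (F2/i/ + F1/a/) / (F1/i/ + F1/u/ + F2/u/ + F2/a/)."""
    return (i[1] + a[0]) / (i[0] + u[0] + u[1] + a[1])

def fcr(i, u, a):
    """FCR is the reciprocal of VAI; values > 1 indicate centralization."""
    return 1.0 / vai(i, u, a)

i, u, a = (270, 2290), (300, 870), (730, 1090)  # typical male corner vowels
print(vowel_space_area([i, a, u]), vai(i, u, a), fcr(i, u, a))
```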
@article {pmid32995486, year = {2020}, author = {Kaya, Z and Soltanipour, M and Treves, A}, title = {Non-hexagonal neural dynamics in vowel space.}, journal = {AIMS neuroscience}, volume = {7}, number = {3}, pages = {275-298}, pmid = {32995486}, issn = {2373-7972}, abstract = {Are the grid cells discovered in rodents relevant to human cognition? Following up on two seminal studies by others, we aimed to check whether an approximate 6-fold, grid-like symmetry shows up in the cortical activity of humans who "navigate" between vowels, given that vowel space can be approximated with a continuous trapezoidal 2D manifold, spanned by the first and second formant frequencies. We created 30 vowel trajectories in the assumedly flat central portion of the trapezoid. Each of these trajectories had a duration of 240 milliseconds, with a steady start and end point on the perimeter of a "wheel". We hypothesized that if the neural representation of this "box" is similar to that of rodent grid units, there should be an at least partial hexagonal (6-fold) symmetry in the EEG response of participants who navigate it. However, we did not find any dominant n-fold symmetry; instead, using PCAs, we found indications that the vowel representation may reflect phonetic features, as positioned on the vowel manifold. The suggestion, therefore, is that vowels are encoded in relation to their salient sensory-perceptual variables, and are not assigned to arbitrary grid-like abstract maps. Finally, we explored the relationship between the first PCA eigenvector and putative vowel attractors for native Italian speakers, who served as the subjects in our study.}, }
@article {pmid32994430, year = {2020}, author = {Moon, IJ and Kang, S and Boichenko, N and Hong, SH and Lee, KM}, title = {Meter enhances the subcortical processing of speech sounds at a strong beat.}, journal = {Scientific reports}, volume = {10}, number = {1}, pages = {15973}, pmid = {32994430}, issn = {2045-2322}, mesh = {Acoustic Stimulation/*methods ; Adult ; Auditory Cortex/*physiology ; Auditory Perception/*physiology ; Female ; Humans ; Male ; Music ; Phonetics ; Sound ; Speech Perception/*physiology ; Young Adult ; }, abstract = {The temporal structure of sound such as in music and speech increases the efficiency of auditory processing by providing listeners with a predictable context. Musical meter is a good example of a sound structure that is temporally organized in a hierarchical manner, with recent studies showing that meter optimizes neural processing, particularly for sounds located at a higher metrical position or strong beat. Whereas enhanced cortical auditory processing at times of high metric strength has been studied, there is to date no direct evidence showing metrical modulation of subcortical processing. In this work, we examined the effect of meter on the subcortical encoding of sounds by measuring human auditory frequency-following responses to speech presented at four different metrical positions. Results show that neural encoding of the fundamental frequency of the vowel was enhanced at the strong beat, and also that the neural consistency of the vowel was the highest at the strong beat. When comparing musicians to non-musicians, musicians were found, at the strong beat, to selectively enhance the behaviorally relevant component of the speech sound, namely the formant frequency of the transient part. Our findings indicate that the meter of sound influences subcortical processing, and this metrical modulation differs depending on musical expertise.}, }
@article {pmid32991418, year = {2020}, author = {Park, EJ and Kim, JH and Choi, YH and Son, JE and Lee, SA and Yoo, SD}, title = {Association between phonation and the vowel quadrilateral in patients with stroke: A retrospective observational study.}, journal = {Medicine}, volume = {99}, number = {39}, pages = {e22236}, pmid = {32991418}, issn = {1536-5964}, mesh = {Aged ; Dysphonia/etiology/physiopathology/*therapy ; Female ; Humans ; Male ; Middle Aged ; *Phonation ; Retrospective Studies ; Stroke/complications ; Stroke Rehabilitation/*methods ; }, abstract = {Articulation disorder is associated with impaired control of respiration and speech organ movement. There are many cases of dysarthria and dysphonia in stroke patients. Dysphonia adversely affects communication and social activities, and it can interfere with everyday life. The purpose of this study is to assess the association between phonation abilities and the vowel quadrilateral in stroke patients. The subjects were stroke patients with pronunciation and phonation disorders. The resonance frequency was measured for the 4 corner vowels to measure the vowel space area (VSA) and formant centralization ratio (FCR). Phonation ability was evaluated by the Dysphonia Severity Index (DSI) and maximal phonation time (MPT) through acoustic evaluation for each vowel. Pearson's correlation analysis was performed to confirm the association, and multiple linear regression analysis was performed between variables. The correlation coefficients of VSA and MPT/u/ were 0.420, VSA and MPT/i/ were 0.536, VSA and DSI/u/ were 0.392, VSA and DSI/i/ were 0.364, and FCR and DSI/i/ were -0.448. Multiple linear regression analysis showed that VSA was a factor significantly influencing MPT/u/ (β = 0.420, P = .021, R = 0.147), MPT/i/ (β = 0.536, P = .002, R = 0.262), DSI/u/ (β = 0.564, P = .045, R = 0.256), and DSI/i/ (β = 0.600, P = .03, R = 0.302). The vowel quadrilateral can be a useful tool for evaluating the phonation function of stroke patients.}, }
@article {pmid32985269, year = {2021}, author = {Ge, S and Wan, Q and Yin, M and Wang, Y and Huang, Z}, title = {Quantitative acoustic metrics of vowel production in mandarin-speakers with post-stroke spastic dysarthria.}, journal = {Clinical linguistics & phonetics}, volume = {35}, number = {8}, pages = {779-792}, doi = {10.1080/02699206.2020.1827295}, pmid = {32985269}, issn = {1464-5076}, mesh = {Acoustics ; *Benchmarking ; *Dysarthria/etiology ; Female ; Humans ; Male ; Phonetics ; Speech Acoustics ; Speech Production Measurement ; }, abstract = {Impaired vowel production in dysarthria has received considerable attention. This study aimed to explore the vowel production of Mandarin speakers with post-stroke spastic dysarthria in connected speech and the influence of gender and tone on that production. Multiple vowel acoustic metrics, including F1 range, F2 range, vowel space area (VSA), vowel articulation index (VAI) and formant centralization ratio (FCR), were analyzed from vowel tokens embedded in connected speech. The participants included 25 clients with spastic dysarthria secondary to stroke (15 males, 10 females) and 25 speakers with no history of neurological disease (15 males, 10 females). Variance analyses were conducted and the results showed that the main effects of population, gender, and tone on F2 range, VSA, VAI, and FCR were all significant. Vowel production became centralized in the clients with post-stroke spastic dysarthria. Vowel production was found to be more centralized in males compared to females. Vowels in neutral tone (T0) were the most centralized among the tones. The quantitative acoustic metrics of F2 range, VSA, VAI, and FCR were effective in predicting vowel production in Mandarin-speaking clients with post-stroke spastic dysarthria, and hence may be used as powerful tools to assess speech performance in this population.}, }
@article {pmid32976078, year = {2020}, author = {Daliri, A and Chao, SC and Fitzgerald, LC}, title = {Compensatory Responses to Formant Perturbations Proportionally Decrease as Perturbations Increase.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {10}, pages = {3392-3407}, pmid = {32976078}, issn = {1558-9102}, support = {R21 DC017563/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Feedback, Sensory ; Female ; Humans ; Phonetics ; Speech ; *Speech Acoustics ; *Speech Perception ; Speech Production Measurement ; }, abstract = {Purpose We continuously monitor our speech output to detect potential errors in our productions. When we encounter errors, we rapidly change our speech output to compensate for the errors. However, it remains unclear whether we adjust the magnitude of our compensatory responses based on the characteristics of errors. Method Participants (N = 30 adults) produced monosyllabic words containing /ɛ/ (/hɛp/, /hɛd/, /hɛk/) while receiving perturbed or unperturbed auditory feedback. In the perturbed trials, we applied two different types of formant perturbations: (a) the F1 shift, in which the first formant of /ɛ/ was increased, and (b) the F1-F2 shift, in which the first formant was increased and the second formant was decreased to make a participant's /ɛ/ sound like his or her /æ/. In each perturbation condition, we applied three participant-specific perturbation magnitudes (0.5, 1.0, and 1.5 ɛ-æ distance). Results Compensatory responses to perturbations with the magnitude of 1.5 ɛ-æ were proportionally smaller than responses to perturbation magnitudes of 0.5 ɛ-æ. Responses to the F1-F2 shift were larger than responses to the F1 shift regardless of the perturbation magnitude. Additionally, compensatory responses for /hɛd/ were smaller than responses for /hɛp/ and /hɛk/. Conclusions Overall, these results suggest that the brain uses its error evaluation to determine the extent of compensatory responses. The brain may also consider categorical errors and phonemic environments (e.g., articulatory configurations of the following phoneme) to determine the magnitude of its compensatory responses to auditory errors.}, }
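[Editor's note] The participant-specific perturbations described above scale the applied formant shifts by each speaker's own /ɛ/-/æ/ distance. A sketch under that reading; the helper name, example formant values, and direction conventions are assumptions for illustration:

```python
# Participant-specific perturbation targets at 0.5, 1.0, and 1.5 times
# the speaker's /ɛ/-/æ/ distance, for an F1-only and an F1-F2 shift.

def perturbation_targets(eps, ae, magnitudes=(0.5, 1.0, 1.5)):
    """eps, ae: (F1, F2) in Hz for a participant's /ɛ/ and /æ/."""
    d_f1 = ae[0] - eps[0]  # /æ/ typically has a higher F1 than /ɛ/
    d_f2 = ae[1] - eps[1]  # ... and a lower F2, so this is negative
    shifts = {}
    for m in magnitudes:
        f1_only = (eps[0] + m * d_f1, eps[1])           # "F1 shift"
        f1_f2 = (eps[0] + m * d_f1, eps[1] + m * d_f2)  # "F1-F2 shift"
        shifts[m] = {"F1": f1_only, "F1-F2": f1_f2}
    return shifts

print(perturbation_targets(eps=(580, 1800), ae=(700, 1650)))
```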
@article {pmid32951953, year = {2022}, author = {Nilsson, T and Laukkanen, AM and Syrjä, T}, title = {Effects of Sixteen Month Voice Training of Student Actors Applying the Linklater Voice Method.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {36}, number = {5}, pages = {733.e9-733.e21}, doi = {10.1016/j.jvoice.2020.08.014}, pmid = {32951953}, issn = {1873-4588}, mesh = {Humans ; Phonation ; Speech Acoustics ; Students ; *Voice ; Voice Quality ; *Voice Training ; }, abstract = {OBJECTIVE: This study investigates the perceptual and acoustic changes in student actors' voices after 16 months of Linklater Voice training, which is a holistic method to train actors' voices.
METHODS: Eleven (n = 11) actor students' text and Voice Range Profile (VRP) recordings were analyzed pretraining and 16 months posttraining. From text readings at comfortable performance loudness, both perceptual and acoustic analyses were made. Acoustic measures included sound pressure level (SPL), fundamental frequency (fo), and sound level differences between different frequency ranges derived from long-term-average spectrum. Sustained vowels [i:], [o], and [e] abstracted from the text sample were analyzed for formant frequencies F1-F4 and the frequency difference between F4 and F3. The VRP was registered to investigate SPL of the softest and loudest phonations throughout the voice range.
RESULTS: The perceived pitch range during text reading increased significantly. The acoustic results showed a strong trend toward a decrease in minimum fo and increases in maximum fo and fo range. The VRP showed a significant increase in the fo range and dynamics (SPL range). Perceived voice production showed a trend toward phonation balance (neither pressed nor breathy) and a darker voice color posttraining.
CONCLUSION: The perceptual and acoustic analysis of text reading and acoustic measures of VRP suggest that LV training has a positive impact on voice.}, }
@article {pmid32943283, year = {2022}, author = {Di Natale, V and Cantarella, G and Manfredi, C and Ciabatta, A and Bacherini, C and DeJonckere, PH}, title = {Semioccluded Vocal Tract Exercises Improve Self-Perceived Voice Quality in Healthy Actors.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {36}, number = {4}, pages = {584.e7-584.e14}, doi = {10.1016/j.jvoice.2020.07.024}, pmid = {32943283}, issn = {1873-4588}, mesh = {Female ; Humans ; Male ; *Singing ; Speech Acoustics ; *Voice Disorders/diagnosis/therapy ; Voice Quality ; Voice Training ; }, abstract = {PURPOSE: Semi-occluded vocal tract exercises (SOVTE) have been shown to lead to more effective and efficient vocal production for individuals with voice disorders and for singers. The aim of the present study is to investigate the effects of a 10-minute SOVTE warm-up protocol on actors' voices.
METHODS: Twenty-seven professional theater actors (16 females) without voice complaints were audio-recorded while reading aloud, with their acting voice, a short dramatic passage at four time points: the day before the show, just before the warm-up protocol (which was performed prior to the show), soon after the warm-up protocol, and soon after the show. The voice quality was acoustically and auditory-perceptually evaluated and quantified at each time point by blinded raters. Self-assessment parameters anonymously collected pre and post exercising were also analyzed.
RESULTS: No statistically significant differences in perceptual ratings or acoustic parameters were found between pre- and post-exercise sessions or between males and females. A statistically significant improvement was detected in the self-assessment parameters concerning comfort of production, sonorousness, vocal clarity and power.
CONCLUSIONS: Vocal warm-up with the described SOVTE protocol produced a self-perceived improvement in comfort of production, voice quality and power, although objective evidence of change was lacking. This straightforward protocol could thus be beneficial if routinely used by professional actors to facilitate voice performance.}, }
@article {pmid32933336, year = {2021}, author = {Sugathan, N and Maruthy, S}, title = {Predictive factors for persistence and recovery of stuttering in children: A systematic review.}, journal = {International journal of speech-language pathology}, volume = {23}, number = {4}, pages = {359-371}, doi = {10.1080/17549507.2020.1812718}, pmid = {32933336}, issn = {1754-9515}, mesh = {Child ; Humans ; Linguistics ; Speech ; Speech Production Measurement ; *Stuttering ; Telephone ; }, abstract = {PURPOSE: The purpose of this study was to systematically review the available literature on various factors that can predict the persistence and recovery of stuttering in children.
METHOD: An electronic search yielded a total of 35 studies, which considered 44 variables that can be potential factors for predicting persistence and recovery.
RESULT: Among the 44 factors studied, only four (phonological abilities, articulatory rate, change in the pattern of disfluencies, and the trend in stuttering severity over the first year post-onset) were identified as replicated predictors of recovery from stuttering. Several other factors, such as differences in the second formant transition between fluent and disfluent speech and articulatory rate measured in phones/sec, were observed to predict the future course of stuttering but lack replicated evidence as predictors.
CONCLUSION: There is clear support for only a limited set of factors as reliable predictors. For several others, it is too early to draw conclusions, because differences in participant age groups, sample sizes, and research tools have produced mixed findings. Hence, systematic and replicated testing of the identified factors is needed before they are put to clinical use.}, }
@article {pmid32921855, year = {2020}, author = {Palaparthi, A and Titze, IR}, title = {Analysis of Glottal Inverse Filtering in the Presence of Source-Filter Interaction.}, journal = {Speech communication}, volume = {123}, number = {}, pages = {98-108}, pmid = {32921855}, issn = {0167-6393}, support = {R01 DC012045/DC/NIDCD NIH HHS/United States ; R01 DC017998/DC/NIDCD NIH HHS/United States ; }, abstract = {The validity of glottal inverse filtering (GIF) to obtain a glottal flow waveform from radiated pressure signal in the presence and absence of source-filter interaction was studied systematically. A driven vocal fold surface model of vocal fold vibration was used to generate source signals. A one-dimensional wave reflection algorithm was used to solve for acoustic pressures in the vocal tract. Several test signals were generated with and without source-filter interaction at various fundamental frequencies and vowels. Linear Predictive Coding (LPC), Quasi Closed Phase (QCP), and Quadratic Programming (QPR) based algorithms, along with supraglottal impulse response, were used to inverse filter the radiated pressure signals to obtain the glottal flow pulses. The accuracy of each algorithm was tested for its recovery of maximum flow declination rate (MFDR), peak glottal flow, open phase ripple factor, closed phase ripple factor, and mean squared error. The algorithms were also tested for their absolute relative errors of the Normalized Amplitude Quotient, the Quasi-Open Quotient, and the Harmonic Richness Factor. The results indicated that the mean squared error decreased with increase in source-filter interaction level suggesting that the inverse filtering algorithms perform better in the presence of source-filter interaction. All glottal inverse filtering algorithms predicted the open phase ripple factor better than the closed phase ripple factor of a glottal flow waveform, irrespective of the source-filter interaction level. Major prediction errors occurred in the estimation of the closed phase ripple factor, MFDR, peak glottal flow, normalized amplitude quotient, and Quasi-Open Quotient. Feedback-related nonlinearity (source-filter interaction) affected the recovered signal primarily when fo was well below the first formant frequency of a vowel. The prediction error increased when fo was close to the first formant frequency due to the difficulty of estimating the precise value of resonance frequencies, which was exacerbated by nonlinear kinetic losses in the vocal tract.}, }
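[Editor's note] Of the algorithms compared above, the LPC-based variant of glottal inverse filtering is the simplest to illustrate: fit an all-pole vocal-tract model to the radiated pressure signal and apply its inverse. A bare-bones sketch, not the authors' pipeline (no closed-phase analysis, and the lip-radiation inverse is a crude running sum); the rule-of-thumb model order is an assumption:

```python
# LPC-based glottal inverse filtering: estimate an all-pole filter,
# inverse-filter the pressure signal to approximate the glottal flow
# derivative, then integrate to approximate the flow itself.
import numpy as np
import librosa
from scipy.signal import lfilter

def lpc_inverse_filter(pressure, fs, order=None):
    if order is None:
        order = int(fs / 1000) + 2          # common rule-of-thumb LPC order
    a = librosa.lpc(pressure, order=order)  # a[0] == 1
    residual = lfilter(a, [1.0], pressure)  # glottal flow derivative estimate
    flow = np.cumsum(residual)              # crude inverse of lip radiation
    return flow - flow.mean()

fs = 16000
t = np.arange(fs) / fs
pressure = np.sin(2 * np.pi * 120 * t) + 0.3 * np.random.randn(fs)
flow = lpc_inverse_filter(pressure.astype(np.float64), fs)
```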
@article {pmid32917459, year = {2022}, author = {Lopes, LW and França, FP and Evangelista, DDS and Alves, JDN and Vieira, VJD and de Lima-Silva, MFB and Pernambuco, LA}, title = {Does the Combination of Glottal and Supraglottic Acoustic Measures Improve Discrimination Between Women With and Without Voice Disorders?.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {36}, number = {4}, pages = {583.e17-583.e29}, doi = {10.1016/j.jvoice.2020.08.006}, pmid = {32917459}, issn = {1873-4588}, mesh = {Acoustics ; Cross-Sectional Studies ; Edema ; Female ; Humans ; *Laryngeal Edema ; Laryngeal Muscles ; Retrospective Studies ; Speech Acoustics ; *Voice Disorders/diagnosis ; Voice Quality ; }, abstract = {AIM: To analyze the accuracy of traditional acoustic measurements (F0, perturbation, and noise) and formant measurements in discriminating between women with and without voice disorders, and with different laryngeal disorders.
STUDY DESIGN: Descriptive, cross-sectional, and retrospective.
METHOD: Two hundred and sixty women participated. All participants recorded the spoken vowel /ɛ/ and underwent laryngeal visual examination. Acoustic measures of the mean and standard deviation of the fundamental frequency (F0), jitter, shimmer, glottal-to-noise excitation ratio, and the values of the first three formants (F1, F2, and F3) were obtained.
RESULTS: Individual acoustic measurements did not demonstrate adequate (<70%) performance when discriminating between women with and without voice disorders. The combination of the standard deviation of the F0, shimmer, glottal-to-noise excitation ratio, F1, F2, and F3 showed acceptable (>70%) performance in classifying women with and without voice disorders. Individual measures of jitter as well as F1 and F3 demonstrated acceptable (>70%) performance when distinguishing women with different laryngeal diagnoses, including without voice disorders (healthy larynges), Reinke's edema, unilateral vocal fold paralysis, and sulcus vocalis. The combination of acoustic measurements showed excellent (>80%) performance when discriminating women without voice disorder from those with Reinke's edema (mean of F0, F1, and F3) and with sulcus vocalis (mean of F0, F1, and F2).
CONCLUSIONS: Individual formant and traditional acoustic measurements do not demonstrate adequate performance when discriminating between women with and without voice disorders. However, the combination of traditional and formant measurements improves the discrimination between the presence and absence of voice disorders and differentiates several laryngeal diagnoses.}, }
@article {pmid32913919, year = {2020}, author = {Kishimoto, T and Takamiya, A and Liang, KC and Funaki, K and Fujita, T and Kitazawa, M and Yoshimura, M and Tazawa, Y and Horigome, T and Eguchi, Y and Kikuchi, T and Tomita, M and Bun, S and Murakami, J and Sumali, B and Warnita, T and Kishi, A and Yotsui, M and Toyoshiba, H and Mitsukura, Y and Shinoda, K and Sakakibara, Y and Mimura, M and , }, title = {The project for objective measures using computational psychiatry technology (PROMPT): Rationale, design, and methodology.}, journal = {Contemporary clinical trials communications}, volume = {19}, number = {}, pages = {100649}, pmid = {32913919}, issn = {2451-8654}, abstract = {INTRODUCTION: Depressive and neurocognitive disorders are debilitating conditions that account for the leading causes of years lived with disability worldwide. However, there are no biomarkers that are objective or easy-to-obtain in daily clinical practice, which leads to difficulties in assessing treatment response and developing new drugs. New technology allows quantification of features that clinicians perceive as reflective of disorder severity, such as facial expressions, phonic/speech information, body motion, daily activity, and sleep.
METHODS: Major depressive disorder, bipolar disorder, and major and minor neurocognitive disorders as well as healthy controls are recruited for the study. A psychiatrist/psychologist conducts conversational 10-min interviews with participants ≤10 times within up to five years of follow-up. Interviews are recorded using RGB and infrared cameras, and an array microphone. As an option, participants are asked to wear wrist-band type devices during the observational period. Various software is used to process the raw video, voice, infrared, and wearable device data. A machine learning approach is used to predict the presence of symptoms, severity, and the improvement/deterioration of symptoms.
DISCUSSION: The overall goal of this proposed study, the Project for Objective Measures Using Computational Psychiatry Technology (PROMPT), is to develop objective, noninvasive, and easy-to-use biomarkers for assessing the severity of depressive and neurocognitive disorders in the hopes of guiding decision-making in clinical settings as well as reducing the risk of clinical trial failure. Challenges may include the large variability of samples, which makes it difficult to extract the features that commonly reflect disorder severity.
TRIAL REGISTRATION: UMIN000021396, University Hospital Medical Information Network (UMIN).}, }
@article {pmid32881631, year = {2020}, author = {Skuk, VG and Kirchen, L and Oberhoffner, T and Guntinas-Lichius, O and Dobel, C and Schweinberger, SR}, title = {Parameter-Specific Morphing Reveals Contributions of Timbre and Fundamental Frequency Cues to the Perception of Voice Gender and Age in Cochlear Implant Users.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {9}, pages = {3155-3175}, doi = {10.1044/2020_JSLHR-20-00026}, pmid = {32881631}, issn = {1558-9102}, mesh = {*Cochlear Implantation ; *Cochlear Implants ; Cues ; Female ; Humans ; Male ; Perception ; *Speech Perception ; *Voice ; }, abstract = {Purpose Using naturalistic synthesized speech, we determined the relative importance of acoustic cues in voice gender and age perception in cochlear implant (CI) users. Method We investigated 28 CI users' abilities to utilize fundamental frequency (F0) and timbre in perceiving voice gender (Experiment 1) and vocal age (Experiment 2). Parameter-specific voice morphing was used to selectively control acoustic cues (F0; time; timbre, i.e., formant frequencies, spectral-level information, and aperiodicity, as defined in TANDEM-STRAIGHT) in voice stimuli. Individual differences in CI users' performance were quantified via deviations from the mean performance of 19 normal-hearing (NH) listeners. Results CI users' gender perception seemed exclusively based on F0, whereas NH listeners efficiently used timbre. For age perception, timbre was more informative than F0 for both groups, with minor contributions of temporal cues. While a few CI users performed comparable to NH listeners overall, others were at chance. Separate analyses confirmed that even high-performing CI users classified gender almost exclusively based on F0. While high performers could discriminate age in male and female voices, low performers were close to chance overall but used F0 as a misleading cue to age (classifying female voices as young and male voices as old). Satisfaction with CI generally correlated with performance in age perception. Conclusions We confirmed that CI users' gender classification is mainly based on F0. However, high performers could make reasonable usage of timbre cues in age perception. Overall, parameter-specific morphing can serve to objectively assess individual profiles of CI users' abilities to perceive nonverbal social-communicative vocal signals.}, }
@article {pmid32873043, year = {2020}, author = {Hansen, JHL and Bokshi, M and Khorram, S}, title = {Speech variability: A cross-language study on acoustic variations of speaking versus untrained singing.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {2}, pages = {829}, pmid = {32873043}, issn = {1520-8524}, support = {R01 DC016839/DC/NIDCD NIH HHS/United States ; }, mesh = {Acoustics ; Humans ; Language ; *Singing ; *Speech ; Speech Acoustics ; }, abstract = {Speech production variability introduces significant challenges for existing speech technologies such as speaker identification (SID), speaker diarization, speech recognition, and language identification (ID). There has been limited research analyzing changes in acoustic characteristics for speech produced by untrained singing versus speaking. To better understand changes in speech production of the untrained singing voice, this study presents the first cross-language comparison between normal speaking and untrained karaoke singing of the same text content. Previous studies comparing professional singing versus speaking have shown deviations in both prosodic and spectral features. Some investigations have also considered assessing the intrinsic activity of the singing. Motivated by these studies, a series of experiments was conducted to investigate both prosodic and spectral variations of untrained karaoke singing in three languages: American English, Hindi, and Farsi. A comprehensive comparison of common prosodic features, including phoneme duration, mean fundamental frequency (F0), and formant center frequencies of vowels, was performed. Collective changes in the corresponding overall acoustic spaces, based on the Kullback-Leibler distance between Gaussian probability distribution models trained on spectral features, were analyzed. Finally, these models were used in a Gaussian mixture model with universal background model (GMM-UBM) SID evaluation to quantify speaker changes between speaking and singing when the audio text content is the same. The experiments showed that many acoustic characteristics of untrained singing are considerably different from speaking when the text content is the same. It is suggested that these results would help advance automatic speech production normalization/compensation to improve performance of speech processing applications (e.g., speaker ID, speech recognition, and language ID).}, }
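[Editor's note] The Kullback-Leibler distance between the Gaussian models mentioned above has a closed form. A sketch follows; since KL divergence is asymmetric, the symmetrized sum is shown, which is an assumption about the authors' exact usage, and the random arrays are stand-ins for real spectral features:

```python
# Closed-form KL divergence between multivariate Gaussians fit to two
# spectral-feature sets (e.g., speaking vs. singing frames), symmetrized.
import numpy as np

def kl_gaussian(mu0, cov0, mu1, cov1):
    k = mu0.shape[0]
    inv1 = np.linalg.inv(cov1)
    diff = mu1 - mu0
    return 0.5 * (np.trace(inv1 @ cov0) + diff @ inv1 @ diff - k
                  + np.log(np.linalg.det(cov1) / np.linalg.det(cov0)))

def kl_distance(feats_a, feats_b):
    """feats_*: (n_frames, n_dims) arrays of spectral features (e.g., MFCCs)."""
    m0, c0 = feats_a.mean(0), np.cov(feats_a, rowvar=False)
    m1, c1 = feats_b.mean(0), np.cov(feats_b, rowvar=False)
    return kl_gaussian(m0, c0, m1, c1) + kl_gaussian(m1, c1, m0, c0)

rng = np.random.default_rng(0)
print(kl_distance(rng.normal(0, 1, (500, 13)), rng.normal(0.5, 1.2, (500, 13))))
```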
@article {pmid32873011, year = {2020}, author = {Winn, MB and Moore, AN}, title = {Perceptual weighting of acoustic cues for accommodating gender-related talker differences heard by listeners with normal hearing and with cochlear implants.}, journal = {The Journal of the Acoustical Society of America}, volume = {148}, number = {2}, pages = {496}, pmid = {32873011}, issn = {1520-8524}, support = {R01 DC017114/DC/NIDCD NIH HHS/United States ; R03 DC014309/DC/NIDCD NIH HHS/United States ; }, mesh = {Acoustic Stimulation ; Acoustics ; *Cochlear Implants ; Cues ; Hearing ; Humans ; Male ; *Speech Perception ; }, abstract = {Listeners must accommodate acoustic differences between the vocal tracts and speaking styles of conversation partners, a process called normalization or accommodation. This study explores what acoustic cues are used to make this perceptual adjustment by listeners with normal hearing or with cochlear implants, when the acoustic variability is related to the talker's gender. A continuum between /ʃ/ and /s/ was paired with naturally spoken vocalic contexts that were parametrically manipulated to vary by numerous cues for talker gender including fundamental frequency (F0), vocal tract length (formant spacing), and direct spectral contrast with the fricative. The goal was to examine relative contributions of these cues toward the tendency to have a lower-frequency acoustic boundary for fricatives spoken by men (found in numerous previous studies). Normal hearing listeners relied primarily on formant spacing and much less on F0. The CI listeners were individually variable, with the F0 cue emerging as the strongest cue on average.}, }
@article {pmid32777195, year = {2020}, author = {Chung, H}, title = {Acquisition and Acoustic Patterns of Southern American English /l/ in Young Children.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {8}, pages = {2609-2624}, doi = {10.1044/2020_JSLHR-19-00040}, pmid = {32777195}, issn = {1558-9102}, mesh = {Acoustics ; Child, Preschool ; Humans ; *Language ; Phonetics ; Speech ; *Speech Acoustics ; United States ; }, abstract = {Purpose The aim of the current study was to examine /l/ developmental patterns in young learners of Southern American English, especially in relation to the effect of word position and phonetic contexts. Method Eighteen children with typically developing speech, aged between 2 and 5 years, produced monosyllabic single words containing singleton /l/ in different word positions (pre- vs. postvocalic /l/) across different vowel contexts (high front vs. low back) and cluster /l/ in different consonant contexts (/pl, bl/ vs. /kl, gl/). Each production was analyzed for its accuracy and acoustic patterns as measured by the first two formant frequencies and their difference (F1, F2, and F2-F1). Results There was great individual variability in /l/ acquisition patterns, with some 2- and 3-year-olds reaching 100% accuracy for prevocalic /l/, while others were below 70%. Overall, accuracy of prevocalic /l/ was higher than that of postvocalic /l/. Acoustic patterns of pre- and postvocalic /l/ showed greater differences in younger children and less apparent differences in 5-year-olds. There were no statistically significant differences between the acoustic patterns of /l/ coded as perceptually acceptable and those coded as misarticulated. There was also no apparent effect of vowel and consonant contexts on /l/ patterns. Conclusion The accuracy patterns of this study suggest an earlier development of /l/, especially prevocalic /l/, than has been reported in previous studies. The differences in acoustic patterns between pre- and postvocalic /l/, which become less apparent with age, may suggest that children alter the way they articulate /l/ with age. No significant acoustic differences between acceptable and misarticulated /l/, especially postvocalic /l/, suggest a gradient nature of /l/ that is dialect specific. This suggests the need for careful consideration of a child's dialect/language background when studying /l/.}, }
@article {pmid32777194, year = {2020}, author = {Lee, J and Kim, H and Jung, Y}, title = {Patterns of Misidentified Vowels in Individuals With Dysarthria Secondary to Amyotrophic Lateral Sclerosis.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {8}, pages = {2649-2666}, doi = {10.1044/2020_JSLHR-19-00237}, pmid = {32777194}, issn = {1558-9102}, mesh = {Acoustics ; *Amyotrophic Lateral Sclerosis/complications ; *Dysarthria/diagnosis/etiology ; Humans ; Phonetics ; Speech Acoustics ; Tongue ; }, abstract = {Purpose The current study examines the pattern of misidentified vowels produced by individuals with dysarthria secondary to amyotrophic lateral sclerosis (ALS). Method Twenty-three individuals with ALS and 22 typical individuals produced 10 monophthongs in an /h/-vowel-/d/ context. One hundred thirty-five listeners completed a forced-choice vowel identification test. Misidentified vowels were examined in terms of the target vowel categories (front-back; low-mid-high) and the direction of misidentification (the directional pattern when the target vowel was misidentified, e.g., misidentification "to a lower vowel"). In addition, acoustic predictors of vowel misidentifications were tested based on log first formant (F1), log second formant, log F1 vowel inherent spectral change, log second formant vowel inherent spectral change, and vowel duration. Results First, high and mid vowels were more frequently misidentified than low vowels for all speaker groups. Second, front and back vowels were misidentified at a similar rate for both the Mild and Severe groups, whereas back vowels were more frequently misidentified than front vowels in typical individuals. Regarding the direction of vowel misidentification, vowel errors were mostly made within the same backness (front-back) category for all groups. In addition, more errors were found toward a lower vowel category than toward a higher vowel category in the Severe group, but not in the Mild group. Overall, log F1 difference was identified as a consistent acoustic predictor of the main vowel misidentification pattern. Conclusion Frequent misidentifications in the vowel height dimension and the acoustic predictor, F1, suggest that limited tongue height control is the major articulatory dysfunction in individuals with ALS. Clinical implications regarding this finding are discussed.}, }
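[Editor's note] The acoustic predictors tested in the entry above combine static log-formants with vowel-inherent spectral change (VISC), i.e., log-formant movement between two measurement points in the vowel. A minimal sketch; the onset/offset measurement points, function name, and example values are assumptions for illustration:

```python
# The five predictors named in the abstract, computed from formant
# measurements (Hz) at vowel onset and offset plus vowel duration.
import math

def vowel_predictors(f1_on, f2_on, f1_off, f2_off, duration_s):
    return {
        "logF1": math.log(f1_on),
        "logF2": math.log(f2_on),
        "logF1_VISC": math.log(f1_off) - math.log(f1_on),
        "logF2_VISC": math.log(f2_off) - math.log(f2_on),
        "duration": duration_s,
    }

print(vowel_predictors(f1_on=650, f2_on=1700, f1_off=600, f2_off=1750,
                       duration_s=0.21))
```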
@article {pmid32754872, year = {2021}, author = {Koo, SK and Kwon, SB and Koh, TK and Ji, CL and Park, GH and Lee, HB}, title = {Acoustic analyses of snoring sounds using a smartphone in patients undergoing septoplasty and turbinoplasty.}, journal = {European archives of oto-rhino-laryngology : official journal of the European Federation of Oto-Rhino-Laryngological Societies (EUFOS) : affiliated with the German Society for Oto-Rhino-Laryngology - Head and Neck Surgery}, volume = {278}, number = {1}, pages = {257-263}, pmid = {32754872}, issn = {1434-4726}, mesh = {Acoustics ; Humans ; Male ; *Nasal Obstruction/diagnosis/surgery ; Nasal Septum/surgery ; Prospective Studies ; *Rhinoplasty ; Smartphone ; Snoring/diagnosis/surgery ; Treatment Outcome ; }, abstract = {PURPOSE: Several studies have been performed using recently developed smartphone-based acoustic analysis techniques. We investigated the effects of septoplasty and turbinoplasty in patients with nasal septal deviation and turbinate hypertrophy accompanied by snoring by recording the sounds of snoring using a smartphone and performing acoustic analysis.
METHODS: A total of 15 male patients who underwent septoplasty with turbinoplasty for snoring and nasal obstruction were included in this prospective study. Preoperatively and 2 months after surgery, their bed partners or caregivers were instructed to record the snoring sounds. The intensity (dB), formant frequencies (F1, F2, F3, and F4), spectrogram pattern, and visual analog scale (VAS) score were analyzed for each subject.
RESULTS: Overall snoring sounds improved after surgery in 12/15 (80%) patients, and there was significant improvement in the intensity of snoring sounds after surgery (from 64.17 ± 12.18 dB to 55.62 ± 9.11 dB, p = 0.018). There was a significant difference in the F1 formant frequency before and after surgery (p = 0.031), but there were no significant differences in F2, F3, or F4. The change in F1 indicated that patients changed from mouth breathing to normal breathing. The degree of subjective snoring sounds improved significantly after surgery (VAS: from 5.40 ± 1.55 to 3.80 ± 1.26, p = 0.003).
CONCLUSION: Our results confirm that snoring is reduced when nasal congestion is improved, and they demonstrate that smartphone-based acoustic analysis of snoring sounds can be useful for diagnosis.}, }
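[Editor's note] The pre/post comparisons in the entry above (e.g., intensity falling from 64.17 to 55.62 dB, p = 0.018) are paired tests across the 15 patients. A sketch with simulated stand-in data; whether the authors used a paired t test or the Wilcoxon signed-rank test shown here is an assumption:

```python
# Paired pre/post comparison of snoring intensity (dB) across patients.
import numpy as np
from scipy.stats import wilcoxon

rng = np.random.default_rng(1)
pre = rng.normal(64.2, 12.2, 15)        # simulated preoperative intensities
post = pre - rng.normal(8.5, 5.0, 15)   # simulated postoperative intensities

stat, p = wilcoxon(pre, post)
print(f"median change = {np.median(post - pre):.2f} dB, p = {p:.3f}")
```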
@article {pmid32738502, year = {2020}, author = {Scott, TL and Haenchen, L and Daliri, A and Chartove, J and Guenther, FH and Perrachione, TK}, title = {Noninvasive neurostimulation of left ventral motor cortex enhances sensorimotor adaptation in speech production.}, journal = {Brain and language}, volume = {209}, number = {}, pages = {104840}, pmid = {32738502}, issn = {1090-2155}, support = {R01 DC002852/DC/NIDCD NIH HHS/United States ; R03 DC014045/DC/NIDCD NIH HHS/United States ; T90 DA032484/DA/NIDA NIH HHS/United States ; }, mesh = {Adult ; Feedback, Sensory/*physiology ; Female ; Humans ; Learning/*physiology ; Male ; Motor Cortex/*physiology ; Patient-Specific Modeling ; Psychomotor Performance/*physiology ; Speech/*physiology ; Speech Acoustics ; *Transcranial Direct Current Stimulation ; Young Adult ; }, abstract = {Sensorimotor adaptation, enduring changes to motor commands due to sensory feedback, allows speakers to match their articulations to intended speech acoustics. How the brain integrates auditory feedback to modify speech motor commands and what limits the degree of these modifications remain unknown. Here, we investigated the role of speech motor cortex in modifying stored speech motor plans. In a within-subjects design, participants underwent separate sessions of sham and anodal transcranial direct current stimulation (tDCS) over speech motor cortex while speaking and receiving altered auditory feedback of the first formant. Anodal tDCS increased the rate of sensorimotor adaptation for feedback perturbation. Computational modeling of our results using the Directions Into Velocities of Articulators (DIVA) framework of speech production suggested that tDCS primarily affected behavior by increasing the feedforward learning rate. This study demonstrates how focal noninvasive neurostimulation can enhance the integration of auditory feedback into speech motor plans.}, }
@article {pmid32720557, year = {2021}, author = {Chung, H and Munson, B and Edwards, J}, title = {Cross-Linguistic Perceptual Categorization of the Three Corner Vowels: Effects of Listener Language and Talker Age.}, journal = {Language and speech}, volume = {64}, number = {3}, pages = {558-575}, doi = {10.1177/0023830920943240}, pmid = {32720557}, issn = {1756-6053}, mesh = {Adult ; Child, Preschool ; Humans ; *Language ; Phonetics ; Speech Acoustics ; *Speech Perception ; }, abstract = {The present study examined the center and size of naïve adult listeners' vowel perceptual space (VPS) in relation to listener language (LL) and talker age (TA). Adult listeners of three different first languages, American English, Greek, and Korean, categorized and rated the goodness of different vowels produced by 2-year-olds and 5-year-olds and adult speakers of those languages, and speakers of Cantonese and Japanese. The center (i.e., mean first and second formant frequencies (F1 and F2)) and size (i.e., area in the F1/F2 space) of VPSs that were categorized either into /a/, /i/, or /u/ were calculated for each LL and TA group. All center and size calculations were weighted by the goodness rating of each stimulus. The F1 and F2 values of the vowel category (VC) centers differed significantly by LL and TA. These effects were qualitatively different for the three vowel categories: English listeners had different /a/ and /u/ centers than Greek and Korean listeners. The size of VPSs did not differ significantly by LL, but did differ by TA and VCs: Greek and Korean listeners had larger vowel spaces when perceiving vowels produced by 2-year-olds than by 5-year-olds or adults, and English listeners had larger vowel spaces for /a/ than /i/ or /u/. Findings indicate that vowel perceptual categories of listeners varied by the nature of their native vowel system, and were sensitive to TA.}, }
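[Editor's note] The center and size calculations described above can be sketched as follows. The goodness-weighted centroid follows directly from the text; operationalizing "area in the F1/F2 space" as a convex hull, and leaving the hull itself unweighted, are assumptions of this sketch:

```python
# Goodness-weighted center and convex-hull size of a vowel perceptual
# space from (F1, F2) stimulus coordinates and goodness ratings.
import numpy as np
from scipy.spatial import ConvexHull

def vps_center_and_size(f1, f2, goodness):
    f1, f2, w = map(np.asarray, (f1, f2, goodness))
    center = (np.average(f1, weights=w), np.average(f2, weights=w))
    # For 2-D points, ConvexHull.volume is the enclosed area (".area" is
    # the perimeter); the goodness weighting of the area is omitted here.
    area = ConvexHull(np.column_stack([f1, f2])).volume
    return center, area

f1 = [300, 320, 700, 680, 450]
f2 = [2200, 2100, 1100, 1200, 1600]
goodness = [6, 5, 7, 4, 3]  # ratings for stimuli categorized as one vowel
print(vps_center_and_size(f1, f2, goodness))
```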
@article {pmid32697631, year = {2020}, author = {Mefferd, AS and Dietrich, MS}, title = {Tongue- and Jaw-Specific Articulatory Changes and Their Acoustic Consequences in Talkers With Dysarthria due to Amyotrophic Lateral Sclerosis: Effects of Loud, Clear, and Slow Speech.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {8}, pages = {2625-2636}, pmid = {32697631}, issn = {1558-9102}, support = {R03 DC015075/DC/NIDCD NIH HHS/United States ; UL1 TR002243/TR/NCATS NIH HHS/United States ; }, mesh = {Acoustics ; *Amyotrophic Lateral Sclerosis/complications ; *Dysarthria/etiology ; Humans ; Speech ; Speech Acoustics ; Speech Intelligibility ; Speech Production Measurement ; Tongue ; }, abstract = {Purpose This study aimed to determine how tongue and jaw displacement changes impact acoustic vowel contrast in talkers with amyotrophic lateral sclerosis (ALS) and controls. Method Ten talkers with ALS and 14 controls participated in this study. Loud, clear, and slow speech cues were used to elicit tongue and jaw kinematic as well as acoustic changes. Speech kinematics was recorded using three-dimensional articulography. Independent tongue and jaw displacements were extracted during the diphthong /ai/ in kite. Acoustic distance between diphthong onset and offset in Formant 1-Formant 2 vowel space indexed acoustic vowel contrast. Results In both groups, all three speech modifications elicited increases in jaw displacement (typical < slow < loud < clear). By contrast, only slow speech elicited significantly increased independent tongue displacement in the ALS group (typical = loud = clear < slow), whereas all three speech modifications elicited significantly increased independent tongue displacement in controls (typical < loud < clear = slow). Furthermore, acoustic vowel contrast significantly increased in response to clear and slow speech in the ALS group, whereas all three speech modifications elicited significant increases in acoustic vowel contrast in controls (typical < loud < slow < clear). Finally, only jaw displacements accounted for acoustic vowel contrast gains in the ALS group. In controls, however, independent tongue displacements accounted for increases in vowel acoustic contrast during loud and slow speech, whereas jaw and independent tongue displacements accounted equally for acoustic vowel contrast change during clear speech. Conclusion Kinematic findings suggest that slow speech may be better suited to target independent tongue displacements in talkers with ALS than clear and loud speech. However, given that gains in acoustic vowel contrast were comparable for slow and clear speech cues in these talkers, future research is needed to determine potential differential impacts of slow and clear speech on perceptual measures, such as intelligibility. Finally, findings suggest that acoustic vowel contrast gains are predominantly jaw driven in talkers with ALS. Therefore, the acoustic and perceptual consequences of direct instructions of enhanced jaw movements should be compared to cued speech modification, such as clear and slow speech in these talkers.}, }
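[Editor's note] The acoustic vowel contrast index used above is the Euclidean distance between diphthong onset and offset in F1-F2 space. A minimal sketch with illustrative /ai/ values:

```python
# Acoustic vowel contrast for /ai/ in "kite": onset-to-offset distance
# in Formant 1-Formant 2 space (Hz).
import math

def diphthong_contrast(onset, offset):
    """onset, offset: (F1, F2) in Hz at diphthong start and end."""
    return math.hypot(offset[0] - onset[0], offset[1] - onset[1])

print(diphthong_contrast(onset=(750, 1300), offset=(400, 2100)))  # /a/ -> /i/
```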
@article {pmid32694252, year = {2020}, author = {Laturnus, R}, title = {Comparative Acoustic Analyses of L2 English: The Search for Systematic Variation.}, journal = {Phonetica}, volume = {77}, number = {6}, pages = {441-479}, doi = {10.1159/000508387}, pmid = {32694252}, issn = {1423-0321}, mesh = {Comprehension ; Humans ; *Language ; Male ; *Speech Acoustics ; *Speech Intelligibility ; *Speech Perception ; }, abstract = {BACKGROUND/AIMS: Previous research has shown that exposure to multiple foreign accents facilitates adaptation to an untrained novel accent. One explanation is that L2 speech varies systematically such that there are commonalities in the productions of nonnative speakers, regardless of their language background.
METHODS: A systematic acoustic comparison was conducted between 3 native English speakers and 6 nonnative accents. Voice onset time, unstressed vowel duration, and formant values of stressed and unstressed vowels were analyzed, comparing each nonnative accent to the native English talkers. A subsequent perception experiment tested the effect of training on regionally accented voices on participants' comprehension of nonnative-accented speech, to investigate the importance of within-speaker variation for attunement and generalization.
RESULTS: Data for each measure show substantial variability across speakers, reflecting phonetic transfer from individual L1s, as well as substantial inconsistency and variability in pronunciation, rather than commonalities in their productions. Training on native English varieties did not improve participants' accuracy in understanding nonnative speech.
CONCLUSION: These findings are more consistent with a hypothesis of accent attunement wherein listeners track general patterns of nonnative speech rather than relying on overlapping acoustic signals between speakers.}, }
@article {pmid32693610, year = {2020}, author = {Rishiq, D and Harkrider, A and Springer, C and Hedrick, M}, title = {Effects of Aging on the Subcortical Encoding of Stop Consonants.}, journal = {American journal of audiology}, volume = {29}, number = {3}, pages = {391-403}, doi = {10.1044/2020_AJA-19-00044}, pmid = {32693610}, issn = {1558-9137}, mesh = {Adolescent ; Adult ; Aged ; Aging/*physiology ; Auditory Perception/physiology ; Brain Stem/physiology/physiopathology ; Evoked Potentials, Auditory, Brain Stem/*physiology ; Female ; Humans ; Male ; Middle Aged ; Phonetics ; Speech Perception/*physiology ; Young Adult ; }, abstract = {Purpose The main purpose of this study was to evaluate aging effects on the predominantly subcortical (brainstem) encoding of the second-formant frequency transition, an essential acoustic cue for perceiving place of articulation. Method Synthetic consonant-vowel syllables varying in second-formant onset frequency (i.e., /ba/, /da/, and /ga/ stimuli) were used to elicit speech-evoked auditory brainstem responses (speech-ABRs) in 16 young adults (M age = 21 years) and 11 older adults (M age = 59 years). Repeated-measures mixed-model analyses of variance were performed on the latencies and amplitudes of the speech-ABR peaks. Fixed factors were phoneme (repeated measures on three levels: /b/ vs. /d/ vs. /g/) and age (two levels: young vs. older). Results Speech-ABR differences were observed between the two groups (young vs. older adults). Specifically, older listeners showed generalized amplitude reductions for onset and major peaks. Significant Phoneme × Group interactions were not observed. Conclusions Results showed aging effects in speech-ABR amplitudes that may reflect diminished subcortical encoding of consonants in older listeners. These aging effects were not phoneme dependent as observed using the statistical methods of this study.}, }
@article {pmid32657177, year = {2021}, author = {Al-Tamimi, F and Howell, P}, title = {Voice onset time and formant onset frequencies in Arabic stuttered speech.}, journal = {Clinical linguistics & phonetics}, volume = {35}, number = {6}, pages = {493-508}, doi = {10.1080/02699206.2020.1786726}, pmid = {32657177}, issn = {1464-5076}, mesh = {Adolescent ; Humans ; Phonetics ; Speech ; Speech Production Measurement ; *Stuttering ; *Voice ; }, abstract = {Neuromuscular models of stuttering consider that making transitions between phones results in inappropriate temporal arrangements of articulators in people who stutter (PWS). Using this framework, the current study examined the acoustic productions of two fine-grained phonetic features: voice onset time (VOT) and second formant (F2). The hypotheses were that PWS should differ from fluent persons (FP) in VOT duration and F2 onset frequency as a result of the transition deficit for environments with complex phonetic features such as Arabic emphatics. Ten adolescent PWS and 10 adolescent FPs participated in the study. They read and memorized four monosyllabic plain-emphatic words silently. Data were analyzed by Repeated Measures ANOVAs. The positive and negative VOT durations of /t/ vs. /tˁ/ and /d/ vs. /dˁ/, and the F2 onset frequency, were measured acoustically. Results showed that stuttering was significantly affected by emphatic consonants. PWS had atypical VOT durations and F2 values. Findings are consistent with the atypicality of VOT and F2 reported for English-speaking PWS. This atypicality is realized differently in Arabic depending on the articulatory complexity and cognitive load of the sound.}, }
@article {pmid32649536, year = {2020}, author = {Levy-Lambert, D and Grigos, MI and LeBlanc, É and DeMitchell-Rodriguez, EM and Noel, DY and Alfonso, AR and Ramly, EP and Rifkin, WJ and Diaz-Siso, JR and Ceradini, DJ and Kantar, RS and Rodriguez, ED}, title = {Communication Efficiency in a Face Transplant Recipient: Determinants and Therapeutic Implications.}, journal = {The Journal of craniofacial surgery}, volume = {31}, number = {6}, pages = {e528-e530}, doi = {10.1097/SCS.0000000000006727}, pmid = {32649536}, issn = {1536-3732}, mesh = {Adult ; *Facial Transplantation ; Humans ; Male ; Speech Intelligibility ; Speech Production Measurement ; Transplant Recipients ; }, abstract = {We longitudinally assessed speech intelligibility (percent words correct/pwc), communication efficiency (intelligible words per minute/iwpm), temporal control markers (speech and pause coefficients of variation), and formant frequencies associated with lip motion in a 41-year-old face transplant recipient. Pwc and iwpm at 13 months post-transplantation were both higher than preoperative values. Multivariate regression demonstrated that temporal markers and all formant frequencies associated with lip motion were significant predictors (P < 0.05) of communication efficiency, highlighting the interplay of these variables in generating intelligible and effective speech. These findings can guide us in developing personalized rehabilitative approaches in face transplant recipients for optimal speech outcomes.}, }
@article {pmid32640180, year = {2020}, author = {Kim, KS and Wang, H and Max, L}, title = {It's About Time: Minimizing Hardware and Software Latencies in Speech Research With Real-Time Auditory Feedback.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {8}, pages = {2522-2534}, pmid = {32640180}, issn = {1558-9102}, support = {R01 DC014510/DC/NIDCD NIH HHS/United States ; R01 DC017444/DC/NIDCD NIH HHS/United States ; MOP-137001//CIHR/Canada ; }, mesh = {Computers ; Feedback ; *Feedback, Sensory ; Humans ; Reproducibility of Results ; Software ; *Speech ; }, abstract = {Purpose Various aspects of speech production related to auditory-motor integration and learning have been examined through auditory feedback perturbation paradigms in which participants' acoustic speech output is experimentally altered and played back via earphones/headphones "in real time." Scientific rigor requires high precision in determining and reporting the involved hardware and software latencies. Many reports in the literature, however, are not consistent with the minimum achievable latency for a given experimental setup. Here, we focus specifically on this methodological issue associated with implementing real-time auditory feedback perturbations, and we offer concrete suggestions for increased reproducibility in this particular line of work. Method Hardware and software latencies as well as total feedback loop latency were measured for formant perturbation studies with the Audapter software. Measurements were conducted for various audio interfaces, desktop and laptop computers, and audio drivers. An approach for lowering Audapter's software latency through nondefault parameter specification was also tested. Results Oft-overlooked hardware-specific latencies were not negligible for some of the tested audio interfaces (adding up to 15 ms). Total feedback loop latencies (including both hardware and software latency) were also generally larger than claimed in the literature. Nondefault parameter values can improve Audapter's own processing latency without negative impact on formant tracking. Conclusions Audio interface selection and software parameter optimization substantially affect total feedback loop latency. Thus, the actual total latency (hardware plus software) needs to be correctly measured and described in all published reports. Future speech research with "real-time" auditory feedback perturbations should increase scientific rigor by minimizing this latency.}, }
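A generic way to verify the hardware component of such latencies is a physical loopback test: wire an output of the interface to an input, play an impulse, and locate it in the recording by cross-correlation. The sketch below illustrates this idea only; it is not the authors' procedure, does not involve Audapter, and the sample rate and loopback wiring are assumptions.

    # Generic loopback latency check: play an impulse, record it back through
    # a physical output-to-input cable, and locate the delay by cross-correlation.
    # Not the authors' procedure; sample rate and wiring are assumptions.
    import numpy as np
    import sounddevice as sd

    FS = 44100                           # sample rate (Hz), adjust to interface
    click = np.zeros(FS, dtype=np.float32)
    click[100] = 1.0                     # single-sample impulse near the start

    recorded = sd.playrec(click, samplerate=FS, channels=1)
    sd.wait()                            # block until playback/recording finish

    # The peak lag of the cross-correlation is the round-trip delay in samples.
    xcorr = np.correlate(recorded[:, 0], click, mode="full")
    lag = int(np.argmax(np.abs(xcorr))) - (len(click) - 1)
    print(f"estimated round-trip latency: {1000.0 * lag / FS:.2f} ms")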
@article {pmid32632010, year = {2020}, author = {Plass, J and Brang, D and Suzuki, S and Grabowecky, M}, title = {Vision perceptually restores auditory spectral dynamics in speech.}, journal = {Proceedings of the National Academy of Sciences of the United States of America}, volume = {117}, number = {29}, pages = {16920-16927}, pmid = {32632010}, issn = {1091-6490}, support = {T32 NS047987/NS/NINDS NIH HHS/United States ; }, mesh = {Adult ; Cues ; Female ; Humans ; Lip/physiology ; Male ; Phonetics ; *Speech Acoustics ; *Speech Perception ; *Visual Perception ; }, abstract = {Visual speech facilitates auditory speech perception, but the visual cues responsible for these benefits and the information they provide remain unclear. Low-level models emphasize basic temporal cues provided by mouth movements, but these impoverished signals may not fully account for the richness of auditory information provided by visual speech. High-level models posit interactions among abstract categorical (i.e., phonemes/visemes) or amodal (e.g., articulatory) speech representations, but require lossy remapping of speech signals onto abstracted representations. Because visible articulators shape the spectral content of speech, we hypothesized that the perceptual system might exploit natural correlations between midlevel visual (oral deformations) and auditory speech features (frequency modulations) to extract detailed spectrotemporal information from visual speech without employing high-level abstractions. Consistent with this hypothesis, we found that the time-frequency dynamics of oral resonances (formants) could be predicted with unexpectedly high precision from the changing shape of the mouth during speech. When isolated from other speech cues, speech-based shape deformations improved perceptual sensitivity for corresponding frequency modulations, suggesting that listeners could exploit this cross-modal correspondence to facilitate perception. To test whether this type of correspondence could improve speech comprehension, we selectively degraded the spectral or temporal dimensions of auditory sentence spectrograms to assess how well visual speech facilitated comprehension under each degradation condition. Visual speech produced drastically larger enhancements during spectral degradation, suggesting a condition-specific facilitation effect driven by cross-modal recovery of auditory speech spectra. The perceptual system may therefore use audiovisual correlations rooted in oral acoustics to extract detailed spectrotemporal information from visual speech.}, }
@article {pmid32631070, year = {2020}, author = {Kent, RD and Rountrey, C}, title = {What Acoustic Studies Tell Us About Vowels in Developing and Disordered Speech.}, journal = {American journal of speech-language pathology}, volume = {29}, number = {3}, pages = {1749-1778}, pmid = {32631070}, issn = {1558-9110}, support = {P30 HD003352/HD/NICHD NIH HHS/United States ; R01 DC006282/DC/NIDCD NIH HHS/United States ; U54 HD090256/HD/NICHD NIH HHS/United States ; }, mesh = {Acoustics ; Adult ; Child ; Child, Preschool ; Humans ; Language ; Phonetics ; Speech ; *Speech Acoustics ; Speech Intelligibility ; *Speech Perception ; }, abstract = {Purpose Literature was reviewed on the development of vowels in children's speech and on vowel disorders in children and adults, with an emphasis on studies using acoustic methods. Method Searches were conducted with PubMed/MEDLINE, Google Scholar, CINAHL, HighWire Press, and legacy sources in retrieved articles. The primary search items included, but were not limited to, vowels, vowel development, vowel disorders, vowel formants, vowel therapy, vowel inherent spectral change, speech rhythm, and prosody. Results/Discussion The main conclusions reached in this review are that vowels are (a) important to speech intelligibility; (b) intrinsically dynamic; (c) refined in both perceptual and productive aspects beyond the age typically given for their phonetic mastery; (d) produced to compensate for articulatory and auditory perturbations; (e) influenced by language and dialect even in early childhood; (f) affected by a variety of speech, language, and hearing disorders in children and adults; (g) inadequately assessed by standardized articulation tests; and (h) characterized by at least three factors-articulatory configuration, extrinsic and intrinsic regulation of duration, and role in speech rhythm and prosody. Also discussed are stages in typical vowel ontogeny, acoustic characterization of rhotic vowels, a sensory-motor perspective on vowel production, and implications for clinical assessment of vowels.}, }
@article {pmid32624371, year = {2022}, author = {Vurma, A}, title = {Amplitude Effects of Vocal Tract Resonance Adjustments When Singing Louder.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {36}, number = {2}, pages = {292.e11-292.e22}, doi = {10.1016/j.jvoice.2020.05.020}, pmid = {32624371}, issn = {1873-4588}, mesh = {Humans ; Male ; *Singing ; Sound ; Vibration ; *Voice ; Voice Quality ; }, abstract = {In the literature on vocal pedagogy we may find suggestions to increase the mouth opening when singing louder. It is known that sopranos tend to sing loud high notes with a wider mouth opening, which raises the frequency of the first resonance of the vocal tract (fR1) to tune it close to the fundamental. Our experiment with classically trained male singers revealed that they also tended to raise the fR1 with the dynamics at pitches where the formant tuning does not seem relevant. The analysis by synthesis showed that such behaviour may contribute to the strengthening of the singer's formant by several dB and to a rise in the centre of spectral gravity. The contribution of the fR1 raising to the overall sound level was less consistent. Changing the extent of the mouth opening with the dynamics may create several simultaneous semantic cues that signal how prominent the produced sound is and how great the physical effort by the singer is. The diminishing of the mouth opening when singing piano may also be important, as it helps singers to produce a quieter sound by increasing the distance between the fR1 and higher resonances, which lowers the transfer function of the vocal tract at the relevant spectral regions.}, }
@article {pmid32616360, year = {2020}, author = {Chaturvedi, R and Kraus, M and Keefe, RSE}, title = {A new measure of authentic auditory emotion recognition: Application to patients with schizophrenia.}, journal = {Schizophrenia research}, volume = {222}, number = {}, pages = {450-454}, doi = {10.1016/j.schres.2019.11.043}, pmid = {32616360}, issn = {1573-2509}, support = {R21 MH101685/MH/NIMH NIH HHS/United States ; }, mesh = {Auditory Perception ; Emotions ; Face ; Facial Expression ; Humans ; Recognition, Psychology ; *Schizophrenia ; }, abstract = {BACKGROUND: Many social processes such as emotion recognition are severely impaired in patients with schizophrenia. While basic auditory processing seems to play a key role in identifying emotions, research in this field is limited due to the lack of proper assessment batteries. Many of the widely accepted tests utilize actors to portray certain emotions; such batteries are lower in ecological and face validity.
METHODS: This study utilized a newly developed auditory emotion recognition test that contained natural stimuli from spontaneous displays of emotions to assess 28 patients with schizophrenia and 16 healthy controls.
RESULTS: The results indicate that the newly developed test, referred to as the INTONATION Test, is more sensitive to the emotion recognition deficits in patients with schizophrenia than previously used measures. The correlations of the INTONATION Test measures with basic auditory processes were similar to established tests of auditory emotion. Particular emotion subscores from the INTONATION Test, such as happiness, demonstrated the strongest correlations with specific auditory processing skills, such as formant discrimination and sinusoidal amplitude modulation detection (SAM60).
CONCLUSIONS: The results from this study indicate that auditory emotion recognition impairments are more pronounced in patients with schizophrenia when perceiving authentic displays of emotion. Understanding these deficits could help specify the nature of auditory emotion recognition deficits in patients with schizophrenia and those at risk.}, }
@article {pmid32611190, year = {2020}, author = {Toutios, A and Xu, M and Byrd, D and Goldstein, L and Narayanan, S}, title = {How an aglossic speaker produces an alveolar-like percept without a functional tongue tip.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {6}, pages = {EL460}, pmid = {32611190}, issn = {1520-8524}, support = {R01 DC007124/DC/NIDCD NIH HHS/United States ; }, mesh = {Female ; Humans ; Phonetics ; Speech ; *Tongue/diagnostic imaging ; *Voice ; }, abstract = {It has been previously observed [McMicken, Salles, Berg, Vento-Wilson, Rogers, Toutios, and Narayanan. (2017). J. Commun. Disorders, Deaf Stud. Hear. Aids 5(2), 1-6] using real-time magnetic resonance imaging that a speaker with severe congenital tongue hypoplasia (aglossia) had developed a compensatory articulatory strategy where she, in the absence of a functional tongue tip, produced a plosive consonant perceptually similar to /d/ using a bilabial constriction. The present paper provides an updated account of this strategy. It is suggested that the previously observed compensatory bilabial closing that occurs during this speaker's /d/ production is consistent with vocal tract shaping resulting from hyoid raising created with mylohyoid action, which may also be involved in typical /d/ production. Simulating this strategy in a dynamic articulatory synthesis experiment leads to the generation of /d/-like formant transitions.}, }
@article {pmid32611162, year = {2020}, author = {Harper, S and Goldstein, L and Narayanan, S}, title = {Variability in individual constriction contributions to third formant values in American English /ɹ/.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {6}, pages = {3905}, pmid = {32611162}, issn = {1520-8524}, support = {R01 DC007124/DC/NIDCD NIH HHS/United States ; T32 DC009975/DC/NIDCD NIH HHS/United States ; }, mesh = {Constriction ; Language ; Pharynx ; *Phonetics ; *Speech Acoustics ; Speech Production Measurement ; United States ; }, abstract = {Although substantial variability is observed in the articulatory implementation of the constriction gestures involved in /ɹ/ production, studies of articulatory-acoustic relations in /ɹ/ have largely ignored the potential for subtle variation in the implementation of these gestures to affect salient acoustic dimensions. This study examines how variation in the articulation of American English /ɹ/ influences the relative sensitivity of the third formant to variation in palatal, pharyngeal, and labial constriction degree. Simultaneously recorded articulatory and acoustic data from six speakers in the USC-TIMIT corpus was analyzed to determine how variation in the implementation of each constriction across tokens of /ɹ/ relates to variation in third formant values. Results show that third formant values are differentially affected by constriction degree for the different constrictions used to produce /ɹ/. Additionally, interspeaker variation is observed in the relative effect of different constriction gestures on third formant values, most notably in a division between speakers exhibiting relatively equal effects of palatal and pharyngeal constriction degree on F3 and speakers exhibiting a stronger palatal effect. This division among speakers mirrors interspeaker differences in mean constriction length and location, suggesting that individual differences in /ɹ/ production lead to variation in articulatory-acoustic relations.}, }
@article {pmid32581975, year = {2020}, author = {Xu, M and Tachibana, RO and Okanoya, K and Hagiwara, H and Hashimoto, RI and Homae, F}, title = {Unconscious and Distinctive Control of Vocal Pitch and Timbre During Altered Auditory Feedback.}, journal = {Frontiers in psychology}, volume = {11}, number = {}, pages = {1224}, pmid = {32581975}, issn = {1664-1078}, abstract = {Vocal control plays a critical role in smooth social communication. Speakers constantly monitor auditory feedback (AF) and make adjustments when their voices deviate from their intentions. Previous studies have shown that when certain acoustic features of the AF are artificially altered, speakers compensate for this alteration in the opposite direction. However, little is known about how the vocal control system implements compensations for alterations of different acoustic features, and associates them with subjective consciousness. The present study investigated whether compensations for the fundamental frequency (F0), which corresponds to perceived pitch, and formants, which contribute to perceived timbre, can be performed unconsciously and independently. Forty native Japanese speakers received two types of altered AF during vowel production that involved shifts of either only the formant frequencies (formant modification; Fm) or both the pitch and formant frequencies (pitch + formant modification; PFm). For each type, three levels of shift (slight, medium, and severe) in both directions (increase or decrease) were used. After the experiment, participants were tested for whether they had perceived a change in the F0 and/or formants. The results showed that (i) only formants were compensated for in the Fm condition, while both the F0 and formants were compensated for in the PFm condition; (ii) the F0 compensation exhibited greater precision than the formant compensation in PFm; and (iii) compensation occurred even when participants misperceived or could not explicitly perceive the alteration in AF. These findings indicate that non-experts can compensate for both formant and F0 modifications in the AF during vocal production, even when the modifications are not explicitly or correctly perceived, which provides further evidence for a dissociation between conscious perception and action in vocal control. We propose that such unconscious control of voice production may enhance rapid adaptation to changing speech environments and facilitate mutual communication.}, }
@article {pmid32554244, year = {2020}, author = {White-Schwoch, T and Magohe, AK and Fellows, AM and Rieke, CC and Vilarello, B and Nicol, T and Massawe, ER and Moshi, N and Kraus, N and Buckey, JC}, title = {Auditory neurophysiology reveals central nervous system dysfunction in HIV-infected individuals.}, journal = {Clinical neurophysiology : official journal of the International Federation of Clinical Neurophysiology}, volume = {131}, number = {8}, pages = {1827-1832}, pmid = {32554244}, issn = {1872-8952}, support = {D43 TW009573/TW/FIC NIH HHS/United States ; R01 DC009972/DC/NIDCD NIH HHS/United States ; }, mesh = {Adolescent ; Adult ; Auditory Perception/*physiology ; Electroencephalography ; Evoked Potentials, Auditory/*physiology ; Female ; HIV Infections/*physiopathology ; Hearing/physiology ; Humans ; Male ; Middle Aged ; Speech ; Speech Perception/*physiology ; Tanzania ; Young Adult ; }, abstract = {OBJECTIVE: To test the hypothesis that human immunodeficiency virus (HIV) affects auditory-neurophysiological functions.
METHODS: A convenience sample of 68 HIV+ and 59 HIV- normal-hearing adults was selected from a study set in Dar es Salaam, Tanzania. The speech-evoked frequency-following response (FFR), an objective measure of auditory function, was collected. Outcome measures were FFRs to the fundamental frequency (F0) and to harmonics corresponding to the first formant (F1), two behaviorally relevant cues for understanding speech.
RESULTS: The HIV+ group had weaker responses to the F1 than the HIV- group; this effect generalized across multiple stimuli (d = 0.59). Responses to the F0 were similar between groups.
CONCLUSIONS: Auditory-neurophysiological responses differ between HIV+ and HIV- adults despite normal hearing thresholds.
SIGNIFICANCE: The FFR may reflect HIV-associated central nervous system dysfunction that manifests as disrupted auditory processing of speech harmonics corresponding to the first formant.}, }
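The outcome measures in the entry above (response strength at the F0 and at harmonics near the F1) can be illustrated by measuring spectral magnitude in narrow bands around those frequencies in an averaged response waveform. The sketch below shows only the general idea, not the authors' pipeline; the F0 and F1 values and the toy waveform are placeholders.

    # Sketch: FFR strength at the F0 and at F0 harmonics nearest the F1,
    # measured as spectral magnitude in narrow bands around each frequency.
    # F0/F1 values and the toy waveform are placeholders, not study data.
    import numpy as np

    def band_amplitude(signal, fs, freq, half_bw=5.0):
        """Mean FFT magnitude within +/- half_bw Hz of freq."""
        spec = np.abs(np.fft.rfft(signal)) / len(signal)
        freqs = np.fft.rfftfreq(len(signal), d=1.0 / fs)
        band = (freqs >= freq - half_bw) & (freqs <= freq + half_bw)
        return spec[band].mean()

    fs, f0, f1 = 16000, 100.0, 720.0                 # assumed values (Hz)
    t = np.arange(0, 0.2, 1.0 / fs)
    response = np.sin(2 * np.pi * f0 * t)            # toy averaged FFR

    amp_f0 = band_amplitude(response, fs, f0)
    harmonics = [h * f0 for h in range(2, 11) if abs(h * f0 - f1) <= 1.5 * f0]
    amp_f1 = np.mean([band_amplitude(response, fs, h) for h in harmonics])
    print(f"F0 amplitude: {amp_f0:.4f}, F1-band amplitude: {amp_f1:.4f}")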
@article {pmid32552327, year = {2020}, author = {DiNino, M and Arenberg, JG and Duchen, ALR and Winn, MB}, title = {Effects of Age and Cochlear Implantation on Spectrally Cued Speech Categorization.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {7}, pages = {2425-2440}, pmid = {32552327}, issn = {1558-9102}, support = {R01 DC012142/DC/NIDCD NIH HHS/United States ; R01 DC017114/DC/NIDCD NIH HHS/United States ; R03 DC014309/DC/NIDCD NIH HHS/United States ; T32 DC005361/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Child ; *Cochlear Implantation ; *Cochlear Implants ; Cues ; Humans ; Speech ; *Speech Perception ; }, abstract = {Purpose Weighting of acoustic cues for perceiving place-of-articulation speech contrasts was measured to determine the separate and interactive effects of age and use of cochlear implants (CIs). It has been found that adults with normal hearing (NH) show reliance on fine-grained spectral information (e.g., formants), whereas adults with CIs show reliance on broad spectral shape (e.g., spectral tilt). In question was whether children with NH and CIs would demonstrate the same patterns as adults, or show differences based on ongoing maturation of hearing and phonetic skills. Method Children and adults with NH and with CIs categorized a /b/-/d/ speech contrast based on two orthogonal spectral cues. Among CI users, phonetic cue weights were compared to vowel identification scores and Spectral-Temporally Modulated Ripple Test thresholds. Results NH children and adults both relied relatively more on the fine-grained formant cue and less on the broad spectral tilt cue compared to participants with CIs. However, early-implanted children with CIs better utilized the formant cue compared to adult CI users. Formant cue weights correlated with CI participants' vowel recognition and in children, also related to Spectral-Temporally Modulated Ripple Test thresholds. Adults and child CI users with very poor phonetic perception showed additive use of the two cues, whereas those with better and/or more mature cue usage showed a prioritized trading relationship, akin to NH listeners. Conclusions Age group and hearing modality can influence phonetic cue-weighting patterns. Results suggest that simple nonlexical categorization tests correlate with more general speech recognition skills of children and adults with CIs.}, }
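Cue weights of the kind compared above are commonly estimated as normalized coefficients from a logistic regression of a listener's binary categorization responses on the two cue values. The sketch below illustrates that logic on simulated responses; it is not the study's analysis, and all names and numbers are invented.

    # Sketch: estimate relative weights of two spectral cues (e.g., formant vs.
    # spectral tilt) from binary /b/-/d/ responses via logistic regression.
    # Data are simulated; this is not the study's analysis pipeline.
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    rng = np.random.default_rng(1)
    n = 400
    formant = rng.uniform(-1, 1, n)           # z-scored formant cue
    tilt = rng.uniform(-1, 1, n)              # z-scored spectral-tilt cue
    # Simulated listener who relies mostly on the formant cue:
    p = 1.0 / (1.0 + np.exp(-(3.0 * formant + 0.5 * tilt)))
    resp = rng.random(n) < p                  # True = "d" response

    model = LogisticRegression().fit(np.column_stack([formant, tilt]), resp)
    w = np.abs(model.coef_[0])
    print("normalized cue weights:", w / w.sum())   # formant cue should dominate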
@article {pmid32539544, year = {2021}, author = {Chiu, Y and Neel, A and Loux, T}, title = {Acoustic characteristics in relation to intelligibility reduction in noise for speakers with Parkinson's disease.}, journal = {Clinical linguistics & phonetics}, volume = {35}, number = {3}, pages = {222-236}, doi = {10.1080/02699206.2020.1777585}, pmid = {32539544}, issn = {1464-5076}, mesh = {Acoustics ; Humans ; *Parkinson Disease/complications ; Speech Acoustics ; Speech Intelligibility ; Speech Production Measurement ; }, abstract = {Decreased speech intelligibility in noisy environments is frequently observed in speakers with Parkinson's disease (PD). This study investigated which acoustic characteristics across the speech subsystems contributed to poor intelligibility in noise for speakers with PD. Speech samples were obtained from 13 speakers with PD and five healthy controls reading 56 sentences. Intelligibility analysis was conducted in quiet and noisy listening conditions. Seventy-two young listeners transcribed the recorded sentences in quiet and another 72 listeners transcribed in noise. The acoustic characteristics of the speakers with PD who experienced large intelligibility reduction from quiet to noise were compared to those with smaller intelligibility reduction in noise and healthy controls. The acoustic measures in the study included second formant transitions, cepstral and spectral measures of voice (cepstral peak prominence and low/high spectral ratio), pitch variation, and articulation rate to represent speech components across speech subsystems of articulation, phonation, and prosody. The results show that speakers with PD who had larger intelligibility reduction in noise exhibited decreased second formant transition, limited cepstral and spectral variations, and faster articulation rate. These findings suggest that the adverse effect of noise on speech intelligibility in PD is related to speech changes in the articulatory and phonatory systems.}, }
@article {pmid32538265, year = {2021}, author = {Rankinen, W and de Jong, K}, title = {The Entanglement of Dialectal Variation and Speaker Normalization.}, journal = {Language and speech}, volume = {64}, number = {1}, pages = {181-202}, doi = {10.1177/0023830920929379}, pmid = {32538265}, issn = {1756-6053}, mesh = {Algorithms ; Humans ; *Language ; *Phonetics ; Psycholinguistics ; Reading ; *Social Behavior ; Speech/*physiology ; Speech Acoustics ; *Verbal Behavior ; }, abstract = {This paper explores the relationship between speaker normalization and dialectal identity in sociolinguistic data, examining a database of vowel formants collected from 88 monolingual American English speakers in Michigan's Upper Peninsula. Audio recordings of Finnish- and Italian-heritage American English speakers reading a passage and a word list were normalized using two normalization procedures. These algorithms are based on different concepts of normalization: Lobanov, which models normalization as based on experience with individual talkers, and Labov ANAE, which models normalization as based on experience with scale-factors inherent in acoustic resonators of all kinds. The two procedures yielded different results; while the Labov ANAE method reveals a cluster shifting of low and back vowels that correlated with heritage, the Lobanov procedure seems to eliminate this sociolinguistic variation. The difference between the two procedures lies in how they treat relations between formant changes, suggesting that dimensions of variation in the vowel space may be treated differently by different normalization procedures, raising the question of how anatomical variation and dialectal variation interact in the real world. The structure of the sociolinguistic effects found with the Labov ANAE normalized data, but not in the Lobanov normalized data, suggests that the Lobanov normalization over-normalizes formant measures and removes sociolinguistically relevant information.}, }
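The two procedures contrasted above differ in what they treat as speaker-specific: Lobanov computes per-speaker, per-formant z-scores, while the Labov ANAE method applies a single multiplicative scale factor per speaker derived from log-mean formant values. The sketch below illustrates that contrast in simplified form; the published ANAE procedure fixes its grand mean on a reference sample, so the placeholder grand mean here is an assumption.

    # Sketch of the two normalization concepts compared in the study.
    # Lobanov: per-speaker, per-formant z-scores. ANAE-style: one multiplicative
    # scale factor per speaker from log-mean formants (simplified; the published
    # ANAE procedure derives its grand mean from a fixed reference sample).
    import numpy as np

    def lobanov(formants):
        """formants: (n_tokens, n_formants) array for ONE speaker."""
        return (formants - formants.mean(axis=0)) / formants.std(axis=0)

    def anae_scale(formants, grand_log_mean):
        """Scale all of a speaker's formants by one factor (Nearey/ANAE style)."""
        speaker_log_mean = np.log(formants).mean()
        return formants * np.exp(grand_log_mean - speaker_log_mean)

    # Toy data: F1/F2 (Hz) for one speaker; grand_log_mean is a placeholder.
    spk = np.array([[740.0, 1310.0], [420.0, 2100.0], [310.0, 2300.0]])
    print(lobanov(spk))
    print(anae_scale(spk, grand_log_mean=np.log(spk).mean()))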
@article {pmid32525399, year = {2020}, author = {Xiao, CC and Luetzenberg, FS and Jiang, N and Liang, J}, title = {Does Nasal Surgery Affect Voice Outcomes? A Systematic Review with Meta-Analyses.}, journal = {The Annals of otology, rhinology, and laryngology}, volume = {129}, number = {12}, pages = {1174-1185}, doi = {10.1177/0003489420933290}, pmid = {32525399}, issn = {1943-572X}, mesh = {Chronic Disease ; Humans ; Nasal Polyps/*surgery ; Nasal Septum/surgery ; *Nasal Surgical Procedures ; Otorhinolaryngologic Surgical Procedures ; Paranasal Sinuses/surgery ; Postoperative Complications/epidemiology/physiopathology ; Rhinitis/*surgery ; Rhinoplasty ; Sinusitis/*surgery ; Treatment Outcome ; Turbinates/surgery ; *Voice Quality ; }, abstract = {OBJECTIVES: Changes in airflow dynamics after nasal surgery may have implications on voice quality. Multiple studies have evaluated the impact of nasal surgery on voice using heterogeneous outcome measures. We aim to systematically review the impact of nasal surgery on voice quality.
METHODS: Our study design was a systematic review with meta-analyses. A literature search of PubMed, Ovid, Cochrane from 1997 to 2017 was performed. Inclusion criteria included English language studies containing original data on nasal surgery and voice. Two investigators independently reviewed all manuscripts and performed a comprehensive quality assessment. Meta-analysis was completed on quantitative voice measurements.
RESULTS: Of 463 studies identified, 19 studies with 692 patients fulfilled eligibility. Nasal surgeries performed included endoscopic sinus surgery (11/20), septoplasty (11/20), rhinoplasty (2/20), and turbinate reduction (2/20). Voice outcomes measured included nasalance (8/20), fundamental frequency (11/20), jitter (10/20), shimmer (10/20), harmonics-to-noise ratio (HNR) (8/20), formants (5/20), and voice handicap index (VHI) (4/20). Voice examinations were assessed preoperatively and 1 to 30 months postoperatively. Meta-analysis revealed statistically significant changes in nasalance (P < .01) 1 month postoperatively; there was no significant difference in nasalance at 6 months postoperatively. All other variables analyzed revealed no statistically significant differences. Five of nine studies showed that the majority of patients did not notice a subjective change in voice after surgery, but with high heterogeneity of measurements.
CONCLUSIONS: There may be a short-term increase in nasalance that resolves at longer follow-up, but there seem to be no other objective changes in voice. There may be subjective changes after surgery, but these require further study to evaluate.}, }
@article {pmid32516559, year = {2020}, author = {Ménard, L and Prémont, A and Trudeau-Fisette, P and Turgeon, C and Tiede, M}, title = {Phonetic Implementation of Prosodic Emphasis in Preschool-Aged Children and Adults: Probing the Development of Sensorimotor Speech Goals.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {6}, pages = {1658-1674}, doi = {10.1044/2020_JSLHR-20-00017}, pmid = {32516559}, issn = {1558-9102}, mesh = {Adult ; Child, Preschool ; Goals ; Humans ; *Phonetics ; *Speech ; Speech Acoustics ; Speech Production Measurement ; }, abstract = {Objective We aimed to investigate the production of contrastive emphasis in French-speaking 4-year-olds and adults. Based on previous work, we predicted that, due to their immature motor control abilities, preschool-aged children would produce smaller articulatory differences between emphasized and neutral syllables than adults. Method Ten 4-year-old children and 10 adult French speakers were recorded while repeating /bib/, /bub/, and /bab/ sequences in neutral and contrastive emphasis conditions. Synchronous recordings of tongue movements, lip and jaw positions, and speech signals were made. Lip positions and tongue shapes were analyzed; formant frequencies, amplitude, fundamental frequency, and duration were extracted from the acoustic signals; and between-vowel contrasts were calculated. Results Emphasized vowels were higher in pitch, intensity, and duration than their neutral counterparts in all participants. However, the effect of contrastive emphasis on lip position was smaller in children. Prosody did not affect tongue position in children, whereas it did in adults. As a result, children's productions were perceived less accurately than those of adults. Conclusion These findings suggest that 4-year-old children have not yet learned to produce hypoarticulated forms of phonemic goals to allow them to successfully contrast syllables and enhance prosodic saliency.}, }
@article {pmid35402959, year = {2020}, author = {Quatieri, TF and Talkar, T and Palmer, JS}, title = {A Framework for Biomarkers of COVID-19 Based on Coordination of Speech-Production Subsystems.}, journal = {IEEE open journal of engineering in medicine and biology}, volume = {1}, number = {}, pages = {203-206}, pmid = {35402959}, issn = {2644-1276}, abstract = {Goal: We propose a speech modeling and signal-processing framework to detect and track COVID-19 through asymptomatic and symptomatic stages. Methods: The approach is based on complexity of neuromotor coordination across speech subsystems involved in respiration, phonation and articulation, motivated by the distinct nature of COVID-19 involving lower (i.e., bronchial, diaphragm, lower tracheal) versus upper (i.e., laryngeal, pharyngeal, oral and nasal) respiratory tract inflammation, as well as by the growing evidence of the virus' neurological manifestations. Preliminary results: An exploratory study with audio interviews of five subjects provides Cohen's d effect sizes between pre-COVID-19 (pre-exposure) and post-COVID-19 (after positive diagnosis but presumed asymptomatic) using: coordination of respiration (as measured through acoustic waveform amplitude) and laryngeal motion (fundamental frequency and cepstral peak prominence), and coordination of laryngeal and articulatory (formant center frequencies) motion. Conclusions: While there is a strong subject-dependence, the group-level morphology of effect sizes indicates a reduced complexity of subsystem coordination. Validation is needed with larger more controlled datasets and to address confounding influences such as different recording conditions, unbalanced data quantities, and changes in underlying vocal status from pre-to-post time recordings.}, }
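The effect size reported above is the group mean difference divided by a pooled standard deviation; a minimal sketch with invented numbers (not the study's data):

    # Cohen's d with a pooled standard deviation, as used for the pre- vs.
    # post-COVID-19 comparisons above. Numbers are toy values, not study data.
    import numpy as np

    def cohens_d(a, b):
        a, b = np.asarray(a, float), np.asarray(b, float)
        na, nb = len(a), len(b)
        pooled_var = ((na - 1) * a.var(ddof=1) + (nb - 1) * b.var(ddof=1)) / (na + nb - 2)
        return (a.mean() - b.mean()) / np.sqrt(pooled_var)

    pre = [0.82, 0.79, 0.85, 0.88, 0.81]    # toy coordination-complexity scores
    post = [0.74, 0.70, 0.77, 0.80, 0.72]
    print(f"Cohen's d = {cohens_d(pre, post):.2f}")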
@article {pmid32379521, year = {2020}, author = {Groll, MD and McKenna, VS and Hablani, S and Stepp, CE}, title = {Formant-Estimated Vocal Tract Length and Extrinsic Laryngeal Muscle Activation During Modulation of Vocal Effort in Healthy Speakers.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {5}, pages = {1395-1403}, pmid = {32379521}, issn = {1558-9102}, support = {R01 DC015570/DC/NIDCD NIH HHS/United States ; T32 DC000030/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Electromyography ; Humans ; *Laryngeal Muscles ; Neck Muscles ; Speech Acoustics ; *Voice ; }, abstract = {Purpose The goal of this study was to explore the relationships among vocal effort, extrinsic laryngeal muscle activity, and vocal tract length (VTL) within healthy speakers. We hypothesized that increased vocal effort would result in increased suprahyoid muscle activation and decreased VTL, as previously observed in individuals with vocal hyperfunction. Method Twenty-eight healthy speakers of American English produced vowel-consonant-vowel utterances under varying levels of vocal effort. VTL was estimated from the vowel formants. Three surface electromyography sensors measured the activation of the suprahyoid and infrahyoid muscle groups. A general linear model was used to investigate the effects of vocal effort level and surface electromyography on VTL. Two additional general linear models were used to investigate the effects of vocal effort on suprahyoid and infrahyoid muscle activities. Results Neither vocal effort nor extrinsic muscle activity showed significant effects on VTL; however, the degree of extrinsic muscle activity of both suprahyoid and infrahyoid muscle groups increased with increases in vocal effort. Conclusion Increasing vocal effort resulted in increased activation of both suprahyoid and infrahyoid musculature in healthy adults, with no change to VTL.}, }
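Formant-based VTL estimates of the kind used above typically model the vocal tract as a uniform tube closed at the glottis, for which the nth resonance is Fn = (2n - 1)c/4L; inverting this per formant and averaging gives a length estimate. The sketch below follows that textbook assumption and is not necessarily the authors' exact estimator.

    # Estimate vocal tract length from formants of a neutral vowel, assuming a
    # uniform tube closed at the glottis: Fn = (2n - 1) * c / (4 * L).
    # Formant values below are illustrative, not from the study.
    import numpy as np

    C = 35000.0  # speed of sound in warm, moist air (cm/s)

    def vtl_from_formants(formants_hz):
        """Average the per-formant length estimates L = (2n - 1) * c / (4 * Fn)."""
        n = np.arange(1, len(formants_hz) + 1)
        return np.mean((2 * n - 1) * C / (4.0 * np.asarray(formants_hz)))

    print(f"VTL ~ {vtl_from_formants([500.0, 1500.0, 2500.0]):.1f} cm")  # ~17.5 cm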
@article {pmid32371713, year = {2020}, author = {Zhou, H and Lu, J and Zhang, C and Li, X and Li, Y}, title = {Abnormal Acoustic Features Following Pharyngeal Flap Surgery in Patients Aged Six Years and Older.}, journal = {The Journal of craniofacial surgery}, volume = {31}, number = {5}, pages = {1395-1399}, doi = {10.1097/SCS.0000000000006483}, pmid = {32371713}, issn = {1536-3732}, mesh = {Acoustics ; Adolescent ; Adult ; Child ; Female ; Humans ; Male ; Otorhinolaryngologic Surgical Procedures/*adverse effects ; Pharynx/*surgery ; Phonetics ; Retrospective Studies ; Speech ; Speech Disorders/*etiology ; *Surgical Flaps ; Treatment Outcome ; Velopharyngeal Insufficiency/surgery ; Young Adult ; }, abstract = {In our study, older velopharyngeal insufficiency (posterior velopharyngeal insufficiency) patients were defined as those older than 6 years of age. This study aimed to evaluate the abnormal acoustic features of older velopharyngeal insufficiency patients before and after posterior pharyngeal flap surgery. A retrospective medical record review was conducted for patients aged 6 years and older, who underwent posterior pharyngeal flap surgery between November 2011 and March 2015. The audio records of patients were evaluated before and after surgery. Spectral analysis was conducted with the Computer Speech Lab (CSL)-4150B acoustic system using the following input data: the vowel /i/, unaspirated plosive /b/, aspirated plosive /p/, aspirated fricatives /s/ and /x/, unaspirated affricates /j/ and /z/, and aspirated affricates /c/ and /q/. The patients were followed up for 3 months. Speech outcome was evaluated by comparing the postoperative phonetic data with the preoperative data. Subjective and objective analyses showed significant differences in the sonogram, formants, and speech articulation before and after the posterior pharyngeal flap surgery. However, the sampled patients could not be considered to have high speech articulation (<85%), as the normal value is 96% or above. Our results showed that pharyngeal flap surgery could correct the speech function of older patients with posterior velopharyngeal insufficiency to some extent. Owing to the original errors in pronunciation patterns, pathological speech articulation still existed, and speech treatment is required in the future.}, }
@article {pmid32359329, year = {2020}, author = {Kochetov, A and Petersen, JH and Arsenault, P}, title = {Acoustics of Kalasha laterals.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {4}, pages = {3012}, doi = {10.1121/10.0001013}, pmid = {32359329}, issn = {1520-8524}, mesh = {Acoustics ; Language ; *Phonetics ; *Speech Acoustics ; }, abstract = {Kalasha, a Northwestern Indo-Aryan language spoken in a remote mountainous region of Pakistan, is relatively unusual among languages of the region as it has lateral approximants contrasting in secondary articulation: velarization and palatalization (/ɫ/ vs /lʲ/). Given the paucity of previous phonetic work on the language and some discrepancies between descriptive accounts, the nature of the Kalasha lateral contrast remains poorly understood. This paper presents an analysis of fieldwork recordings with laterals produced by 14 Kalasha speakers in a variety of lexical items and phonetic contexts. Acoustic analysis of formants measured during the lateral closure revealed that the contrast was most clearly distinguished by F2 (as well as by the F2-F1 difference), which was considerably higher for /lʲ/ than for /ɫ/. This confirms that the two laterals are primarily distinguished by secondary articulation and not by retroflexion, which is otherwise robustly represented in the language inventory. The laterals showed no positional differences but did show considerable fronting (higher F2) next to front vowels. Some inter-speaker variation was observed in the realization of /ɫ/, which was produced with little or no velarization by older speakers. This is indicative of a change in progress, resulting in an overall enhancement of an otherwise auditorily vulnerable contrast.}, }
@article {pmid32359316, year = {2020}, author = {Almurashi, W and Al-Tamimi, J and Khattab, G}, title = {Static and dynamic cues in vowel production in Hijazi Arabic.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {4}, pages = {2917}, doi = {10.1121/10.0001004}, pmid = {32359316}, issn = {1520-8524}, mesh = {Cues ; Humans ; Language ; Male ; Phonetics ; *Speech Acoustics ; *Speech Perception ; }, abstract = {Static cues such as formant measurements obtained at the vowel midpoint are usually taken as the main correlate for vowel identification. However, dynamic cues such as vowel-inherent spectral change have been shown to yield better classification of vowels using discriminant analysis. The aim of this study is to evaluate the role of static versus dynamic cues in Hijazi Arabic (HA) vowel classification, in addition to vowel duration and F3, which are not usually looked at. Data from 12 male HA speakers producing eight HA vowels in /hVd/ syllables were obtained, and classification accuracy was evaluated using discriminant analysis. Dynamic cues, particularly the three-point model, had higher classification rates (average 95.5%) than the remaining models (static model: 93.5%; other dynamic models: between 65.75% and 94.25%). Vowel duration had a significant role in classification accuracy (average +8%). These results are in line with dynamic approaches to vowel classification and highlight the relative importance of cues such as vowel duration across languages, particularly where it is prominent in the phonology.}, }
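The static-versus-dynamic comparison above can be illustrated with a discriminant analysis scored by cross-validated classification accuracy, contrasting midpoint formants with a three-point model. The tokens below are simulated, not the Hijazi Arabic data, so the accuracies are illustrative only.

    # Sketch: compare static (midpoint) vs. dynamic (three-point) formant cues
    # for vowel classification with linear discriminant analysis.
    # Tokens are simulated; this is not the Hijazi Arabic dataset.
    import numpy as np
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.model_selection import cross_val_score

    rng = np.random.default_rng(0)
    centers = {"i": (300, 2200), "a": (700, 1300), "u": (350, 800)}
    X_dyn, y = [], []
    for vowel, (f1, f2) in centers.items():
        for _ in range(50):
            # F1/F2 at 25%, 50%, 75% of the vowel, with noise and a small drift.
            track = [(f1 + rng.normal(0, 40) + d, f2 + rng.normal(0, 80) - 2 * d)
                     for d in (-20, 0, 20)]
            X_dyn.append(np.ravel(track))
            y.append(vowel)
    X_dyn = np.array(X_dyn)
    X_static = X_dyn[:, 2:4]              # midpoint F1/F2 only

    for name, X in [("static", X_static), ("three-point", X_dyn)]:
        acc = cross_val_score(LinearDiscriminantAnalysis(), X, y, cv=5).mean()
        print(f"{name:12s} accuracy: {acc:.2f}")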
@article {pmid32359308, year = {2020}, author = {Egurtzegi, A and Carignan, C}, title = {An acoustic description of Mixean Basque.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {4}, pages = {2791}, doi = {10.1121/10.0000996}, pmid = {32359308}, issn = {1520-8524}, mesh = {Acoustics ; *Language ; Phonetics ; Spain ; *Speech Acoustics ; }, abstract = {This paper presents an acoustic analysis of Mixean Low Navarrese, an endangered variety of Basque. The manuscript includes an overview of previous acoustic studies performed on different Basque varieties in order to synthesize the sparse acoustic descriptions of the language that are available. This synthesis serves as a basis for the acoustic analysis performed in the current study, in which the various acoustic analyses given in previous studies are replicated in a single, cohesive general acoustic description of Mixean Basque. The analyses include formant and duration measurements for the six-vowel system, voice onset time measurements for the three-way stop system, spectral center of gravity for the sibilants, and number of lingual contacts in the alveolar rhotic tap and trill. Important findings include: a centralized realization ([ʉ]) of the high-front rounded vowel usually described as /y/; a data-driven confirmation of the three-way laryngeal opposition in the stop system; evidence in support of an alveolo-palatal to apical sibilant merger; and the discovery of a possible incipient merger of rhotics. These results show how using experimental acoustic methods to study under-represented linguistic varieties can result in revelations of sound patterns otherwise undescribed in more commonly studied varieties of the same language.}, }
@article {pmid32359305, year = {2020}, author = {Mellesmoen, G and Babel, M}, title = {Acoustically distinct and perceptually ambiguous: ʔayʔaǰuθəm (Salish) fricatives.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {4}, pages = {2959}, doi = {10.1121/10.0001007}, pmid = {32359305}, issn = {1520-8524}, mesh = {Acoustics ; Language ; *Phonetics ; Speech Acoustics ; *Speech Perception ; }, abstract = {ʔayʔaǰuθəm (Comox-Sliammon) is a Central Salish language spoken in British Columbia with a large fricative inventory. Previous impressionistic descriptions of ʔayʔaǰuθəm have noted perceptual ambiguity of select anterior fricatives. This paper provides an auditory-acoustic description of the four anterior fricatives /θ s ʃ ɬ/ in the Mainland dialect of ʔayʔaǰuθəm. Peak ERBN trajectories, noise duration, and formant transitions are analysed in the fricative productions of five speakers. These analyses provide quantitative and qualitative descriptions of these fricative contrasts, indicating more robust acoustic differentiation for fricatives in onset versus coda position. In a perception task, English listeners categorized fricatives in CV and VC sequences from the natural productions. The results of the perception experiment are consistent with reported perceptual ambiguity between /s/ and /θ/, with listeners frequently misidentifying /θ/ as /s/. The production and perception data suggest that listener L1 categories play a role in the categorization and discrimination of ʔayʔaǰuθəm fricatives. These findings provide an empirical description of fricatives in an understudied language and have implications for L2 teaching and learning in language revitalization contexts.}, }
@article {pmid32359280, year = {2020}, author = {Rosen, N and Stewart, J and Sammons, ON}, title = {How "mixed" is mixed language phonology? An acoustic analysis of the Michif vowel system.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {4}, pages = {2989}, doi = {10.1121/10.0001009}, pmid = {32359280}, issn = {1520-8524}, mesh = {Acoustics ; Canada ; Humans ; *Language ; *Phonetics ; Speech Acoustics ; }, abstract = {Michif, a severely endangered language still spoken today by an estimated 100-200 Métis people in Western Canada, is generally classified as a mixed language, meaning it cannot be traced back to a single language family [Bakker (1997). A Language of Our Own (Oxford University Press, Oxford); Thomason (2001). Language Contact: An Introduction (Edinburgh University Press and Georgetown University Press, Edinburgh and Washington, DC); Meakins (2013). Contact Languages: A Comprehensive Guide (Mouton De Gruyter, Berlin), pp. 159-228]. It has been claimed to maintain the phonological grammar of both of its source languages, French and Plains Cree [Rhodes (1977). Actes du Huitième Congrès des Algonquinistes (Carleton University, Ottawa), pp. 6-25; Bakker (1997). A Language of Our Own (Oxford University Press, Oxford); Bakker and Papen (1997). Contact Languages: A Wider Perspective (John Benjamins, Amsterdam), pp. 295-363]. The goal of this paper is twofold: to offer an instrumental analysis of Michif vowels and to investigate this claim of a stratified grammar, based on this careful phonetic analysis. Using source language as a variable in the analysis, the authors argue the Michif vowel system does not appear to rely on historical information, and that historically similar French and Cree vowels pattern together within the Michif system with regards to formant frequencies and duration. The authors show that there are nine Michif oral vowels in this system, which has merged phonetically similar French- and Cree-source vowels.}, }
@article {pmid32359278, year = {2020}, author = {van Brenk, F and Terband, H}, title = {Compensatory and adaptive responses to real-time formant shifts in adults and children.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {4}, pages = {2261}, doi = {10.1121/10.0001018}, pmid = {32359278}, issn = {1520-8524}, mesh = {Adaptation, Physiological ; Adolescent ; Adult ; Child ; Child, Preschool ; Feedback, Sensory ; Humans ; Speech ; *Speech Perception ; Speech Production Measurement ; Young Adult ; }, abstract = {Auditory feedback plays an important role in speech motor learning, yet little is known about the strength of motor learning and feedback control in speech development. This study investigated compensatory and adaptive responses to auditory feedback perturbation in children (aged 4-9 years) and young adults (aged 18-29 years). Auditory feedback was perturbed by near-real-time shifting of F1 and F2 of the vowel /ɪː/ during the production of consonant-vowel-consonant words. Children were able to compensate and adapt to a similar or larger degree than young adults. Higher token-to-token variability was found in children compared to adults, but not disproportionately higher during the perturbation phases compared to the unperturbed baseline. The added challenge to auditory-motor integration did not influence production variability in children, and compensation and adaptation effects were found to be strong and sustainable. Significant group differences were absent in the proportions of speakers displaying a compensatory or adaptive response, an amplifying response, or no consistent response. Within these categories, children produced significantly stronger compensatory, adaptive, or amplifying responses, which could be explained by less-ingrained existing representations. The results are interpreted as indicating that both auditory-motor integration and learning capacities are stronger in young children than in adults.}, }
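Compensation in such paradigms is conventionally quantified as the change in produced formant frequency relative to an unperturbed baseline, expressed as a percentage of the applied shift, with changes opposing the shift counted as compensatory. A toy sketch of that bookkeeping (values invented, not the study's data):

    # Sketch: quantify compensation to an F1 perturbation as the production
    # change relative to baseline, as a percentage of the applied shift.
    # Values are toy numbers, not data from the study.
    import numpy as np

    baseline_f1 = np.array([580.0, 575.0, 590.0, 585.0])   # unperturbed tokens (Hz)
    shifted_f1 = np.array([545.0, 540.0, 550.0, 548.0])    # produced under the shift
    applied_shift = 100.0                                  # feedback F1 raised by 100 Hz

    change = shifted_f1.mean() - baseline_f1.mean()        # negative = opposing the shift
    compensation_pct = -change / applied_shift * 100.0
    print(f"compensation: {compensation_pct:.0f}% of the applied shift")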
@article {pmid32359273, year = {2020}, author = {Chiu, C and Sun, JT}, title = {On pharyngealized vowels in Northern Horpa: An acoustic and ultrasound study.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {4}, pages = {2928}, doi = {10.1121/10.0001005}, pmid = {32359273}, issn = {1520-8524}, mesh = {Acoustics ; Language ; *Phonetics ; *Speech Acoustics ; Ultrasonography ; }, abstract = {In the Northern Horpa (NH) language of Sichuan, vowels are divided between plain and pharyngealized sets, with the latter pronounced with auxiliary articulatory gestures involving more constriction in the vocal tract. The current study examines how the NH vocalic contrast is manifested in line with the process of pharyngealization both acoustically and articulatorily, based on freshly gathered data from two varieties of the language (i.e., Rtsangkhog and Yunasche). Along with formant analyses, ultrasound imaging was employed to capture the tongue postures and positions during vowel production. The results show that in contrast with plain vowels, pharyngealized vowels generally feature lower F2 values and higher F1 and F3 values. Mixed results for F2 and F3 suggest that the quality contrasts are vowel-dependent. Ultrasound images, on the other hand, reveal that the vocalic distinction is affected by different types of tongue movements, including retraction, backing, and double bunching, depending on the inherent tongue positions for each vowel. The two NH varieties investigated are found to display differential formant changes and different types of tongue displacements. The formant profiles along with ultrasound images support the view that the production of the NH phonologically marked vowels is characteristic of pharyngealization.}, }
@article {pmid32359268, year = {2020}, author = {Horo, L and Sarmah, P and Anderson, GDS}, title = {Acoustic phonetic study of the Sora vowel system.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {4}, pages = {3000}, doi = {10.1121/10.0001011}, pmid = {32359268}, issn = {1520-8524}, mesh = {Acoustics ; India ; Language ; *Phonetics ; *Speech Acoustics ; }, abstract = {This paper is an acoustic phonetic study of vowels in Sora, a Munda language of the Austroasiatic language family. Descriptions here illustrate that the Sora vowel system has six vowels and provide evidence that Sora disyllables have prominence on the second syllable. While the acoustic categorization of vowels is based on formant frequencies, the presence of prominence on the second syllable is shown through temporal features of vowels, including duration, intensity, and fundamental frequency. Additionally, this paper demonstrates that acoustic categorization of vowels in Sora is better in the prominent syllable than in the non-prominent syllable, providing evidence that syllable prominence and vowel quality are correlated in Sora. These acoustic properties of Sora vowels are discussed in relation to the existing debates on vowels and patterns of syllable prominence in Munda languages of India. In this regard, it is noteworthy that Munda languages, in general, lack instrumental studies, and therefore this paper presents significant findings that are undocumented in other Munda languages. These acoustic studies are supported by exploratory statistical modeling and statistical classification methods.}, }
@article {pmid32359261, year = {2020}, author = {Sarvasy, H and Elvin, J and Li, W and Escudero, P}, title = {An acoustic phonetic description of Nungon vowels.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {4}, pages = {2891}, doi = {10.1121/10.0001003}, pmid = {32359261}, issn = {1520-8524}, mesh = {Acoustics ; Papua New Guinea ; *Phonetics ; Speech ; *Speech Acoustics ; }, abstract = {This study is a comprehensive acoustic description and analysis of the six vowels /i e a u o ɔ/ in the Towet dialect of the Papuan language Nungon ⟨yuw⟩ of northeastern Papua New Guinea. Vowel tokens were extracted from a corpus of audio speech recordings created for general language documentation and grammatical description. To assess the phonetic correlates of a claimed phonological vowel length distinction, vowel duration was measured. Multi-point acoustic analyses enabled investigation of mean vowel F1, F2, and F3; vowel trajectories, and coarticulation effects. The three Nungon back vowels were of particular interest, as they contribute to an asymmetrical, back vowel-heavy array, and /o/ had previously been described as having an especially low F2. The authors found that duration of phonologically long and short vowels differed significantly. Mean vowel formant measurements confirmed that the six phonological vowels form six distinct acoustic groupings; trajectories show slightly more formant movement in some vowels than was previously known. Adjacent nasal consonants exerted significant effects on vowel formant measurements. The authors show that an uncontrolled, general documentation corpus for an under-described language can be mined for acoustic analysis, but coarticulation effects should be taken into account.}, }
@article {pmid32359247, year = {2020}, author = {Nance, C and Kirkham, S}, title = {The acoustics of three-way lateral and nasal palatalisation contrasts in Scottish Gaelic.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {4}, pages = {2858}, doi = {10.1121/10.0000998}, pmid = {32359247}, issn = {1520-8524}, mesh = {Acoustics ; *Language ; Phonetics ; Scotland ; *Speech Acoustics ; }, abstract = {This paper presents an acoustic description of laterals and nasals in an endangered minority language, Scottish Gaelic (known as "Gaelic"). Gaelic sonorants are reported to take part in a typologically unusual three-way palatalisation contrast. Here, the acoustic evidence for this contrast is considered, comparing lateral and nasal consonants in both word-initial and word-final position. Previous acoustic work has considered lateral consonants, but nasals are much less well-described. An acoustic analysis of twelve Gaelic-dominant speakers resident in a traditionally Gaelic-speaking community is reported. Sonorant quality is quantified via measurements of F2-F1 and F3-F2 and observation of the whole spectrum. Additionally, we quantify extensive devoicing in word-final laterals that has not been previously reported. Mixed-effects regression modelling suggests robust three-way acoustic differences in lateral consonants in all relevant vowel contexts. Nasal consonants, however, display lesser evidence of the three-way contrast in formant values and across the spectrum. Potential reasons for lesser evidence of contrast in the nasal system are discussed, including the nature of nasal acoustics, evidence from historical changes, and comparison to other Goidelic dialects. In doing so, contributions are made to accounts of the acoustics of the Celtic languages, and to typologies of contrastive palatalisation in the world's languages.}, }
@article {pmid32359243, year = {2020}, author = {Tabain, M and Butcher, A and Breen, G and Beare, R}, title = {A formant study of the alveolar versus retroflex contrast in three Central Australian languages: Stop, nasal, and lateral manners of articulation.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {4}, pages = {2745}, doi = {10.1121/10.0001012}, pmid = {32359243}, issn = {1520-8524}, mesh = {Acoustics ; Australia ; Language ; *Phonetics ; *Speech Acoustics ; }, abstract = {This study presents formant transition data from 21 speakers for the apical alveolar∼retroflex contrast in three neighbouring Central Australian languages: Arrernte, Pitjantjatjara, and Warlpiri. The contrast is examined for three manners of articulation: stop, nasal, and lateral /t ∼ ʈ/ /n ∼ ɳ/, and /l ∼ ɭ/, and three vowel contexts /a i u/. As expected, results show that a lower F3 and F4 in the preceding vowel signal a retroflex consonant; and that the alveolar∼retroflex contrast is most clearly realized in the context of an /a/ vowel, and least clearly realized in the context of an /i/ vowel. Results also show that the contrast is most clearly realized for the stop manner of articulation. These results provide an acoustic basis for the greater typological rarity of retroflex nasals and laterals as compared to stops. It is suggested that possible nasalization of the preceding vowel accounts for the poorer nasal consonant results, and that articulatory constraints on lateral consonant production account for the poorer lateral consonant results. Importantly, differences are noticed between speakers, and it is suggested that literacy plays a major role in maintenance of this marginal phonemic contrast.}, }
@article {pmid32339775, year = {2020}, author = {Liepins, R and Kaider, A and Honeder, C and Auinger, AB and Dahm, V and Riss, D and Arnoldner, C}, title = {Formant frequency discrimination with a fine structure sound coding strategy for cochlear implants.}, journal = {Hearing research}, volume = {392}, number = {}, pages = {107970}, doi = {10.1016/j.heares.2020.107970}, pmid = {32339775}, issn = {1878-5891}, mesh = {Acoustic Stimulation ; Adolescent ; Adult ; Aged ; Cochlea/*physiopathology ; Cochlear Implantation/*instrumentation ; *Cochlear Implants ; Comprehension ; Cross-Over Studies ; Electric Stimulation ; Female ; *Hearing ; Hearing Loss/diagnosis/physiopathology/*therapy ; Humans ; Longitudinal Studies ; Male ; Middle Aged ; Noise/adverse effects ; Perceptual Masking ; Persons With Hearing Impairments/psychology/*rehabilitation ; *Pitch Discrimination ; Speech Intelligibility ; *Speech Perception ; Young Adult ; }, abstract = {Recent sound coding strategies for cochlear implants (CI) have focused on the transmission of temporal fine structure to the CI recipient. To date, the effects of fine structure coding in electrical hearing remain poorly characterized. The aim of this study was to examine whether the presence of temporal fine structure coding affects how the CI recipient perceives sound. This was done by comparing two sound coding strategies with different temporal fine structure coverage in a longitudinal cross-over setting. The more recent FS4 coding strategy provides fine structure coding on typically four apical stimulation channels, compared to FSP with usually one or two fine structure channels. 34 adult CI patients with a minimum CI experience of one year were included. All subjects were fitted according to clinical routine and used both coding strategies for three months in a randomized sequence. Formant frequency discrimination thresholds (FFDT) were measured to assess the ability to resolve timbre information. Further outcome measures included a monosyllables test in quiet and the speech reception threshold of an adaptive matrix sentence test in noise (Oldenburger sentence test). In addition, the subjective sound quality was assessed using visual analogue scales and a sound quality questionnaire after each three-month period. The extended fine structure range of FS4 yields FFDT similar to FSP for formants occurring in the frequency range only covered by FS4. There is a significant interaction (p = 0.048) between the extent of fine structure coverage in FSP and the improvement in FFDT in favour of FS4 for these stimuli. Speech perception in noise and in quiet was similar with both coding strategies. Sound quality was rated heterogeneously, showing that both strategies represent valuable options for CI fitting to allow for the best possible individual optimization.}, }
@article {pmid32339072, year = {2020}, author = {Dorman, MF and Natale, SC and Baxter, L and Zeitler, DM and Carlson, ML and Lorens, A and Skarzynski, H and Peters, JPM and Torres, JH and Noble, JH}, title = {Approximations to the Voice of a Cochlear Implant: Explorations With Single-Sided Deaf Listeners.}, journal = {Trends in hearing}, volume = {24}, number = {}, pages = {2331216520920079}, pmid = {32339072}, issn = {2331-2165}, support = {R01 DC014037/DC/NIDCD NIH HHS/United States ; }, mesh = {*Cochlear Implantation ; *Cochlear Implants ; *Deafness/diagnosis ; Humans ; *Speech Perception ; }, abstract = {Fourteen single-sided deaf listeners fit with an MED-EL cochlear implant (CI) judged the similarity of clean signals presented to their CI and modified signals presented to their normal-hearing ear. The signals to the normal-hearing ear were created by (a) filtering, (b) spectral smearing, (c) changing overall fundamental frequency (F0), (d) F0 contour flattening, (e) changing formant frequencies, (f) altering resonances and ring times to create a metallic sound quality, (g) using a noise vocoder, or (h) using a sine vocoder. The operations could be used singly or in any combination. On a scale of 1 to 10 where 10 was a complete match to the sound of the CI, the mean match score was 8.8. Over half of the matches were 9.0 or higher. The most common alterations to a clean signal were band-pass or low-pass filtering, spectral peak smearing, and F0 contour flattening. On average, 3.4 operations were used to create a match. Upshifts in formant frequencies were implemented most often for electrode insertion angles less than approximately 500°. A relatively small set of operations can produce signals that approximate the sound of the MED-EL CI. There are large individual differences in the combination of operations needed. The sound files in Supplemental Material approximate the sound of the MED-EL CI for patients fit with 28-mm electrode arrays.}, }
@article {pmid32330738, year = {2020}, author = {Eipert, L and Klump, GM}, title = {Uncertainty-based informational masking in a vowel discrimination task for young and old Mongolian gerbils.}, journal = {Hearing research}, volume = {392}, number = {}, pages = {107959}, doi = {10.1016/j.heares.2020.107959}, pmid = {32330738}, issn = {1878-5891}, mesh = {Acoustic Stimulation ; Age Factors ; *Aging ; Animals ; *Auditory Perception ; Auditory Threshold ; *Behavior, Animal ; *Discrimination, Psychological ; Female ; Gerbillinae ; Humans ; Male ; Species Specificity ; *Speech Acoustics ; Speech Perception ; *Uncertainty ; *Voice Quality ; }, abstract = {Informational masking emerges with processing of complex sounds in the central auditory system and can be affected by uncertainty emerging from trial-to-trial variation of stimulus features. Uncertainty can be non-informative but confusing and thus mask otherwise salient stimulus changes, resulting in increased discrimination thresholds. With increasing age, the ability to process such complex sound scenes degrades. Here, 6 young and 4 old gerbils were tested behaviorally in a vowel discrimination task. Animals were trained to discriminate between sequentially presented target and reference vowels of the vowel pair /I/-/i/. Reference and target vowels were generated by shifting the three formants of the reference vowel in steps towards the formants of the target vowels. Non-informative but distracting uncertainty was introduced by random changes in location, level, fundamental frequency or all three features combined. Young gerbils tested with uncertainty for the target or target and reference vowels showed similar informational masking effects for both conditions. Young and old gerbils were tested with uncertainty for the target vowels only. Old gerbils showed no threshold increase in comparison with young gerbils when discriminating vowels without uncertainty. When uncertainty was introduced, vowel discrimination thresholds increased for young and old gerbils, and thresholds increased most when all three uncertainty features were presented combined. Old gerbils were more susceptible to non-informative uncertainty and their thresholds increased more than thresholds of young gerbils. Gerbils' vowel discrimination thresholds are compared to human performance in the same task (Eipert et al., 2019).}, }
@article {pmid32318928, year = {2020}, author = {Toyoda, A and Maruhashi, T and Malaivijitnond, S and Koda, H}, title = {Dominance status and copulatory vocalizations among male stump-tailed macaques in Thailand.}, journal = {Primates; journal of primatology}, volume = {61}, number = {5}, pages = {685-694}, doi = {10.1007/s10329-020-00820-7}, pmid = {32318928}, issn = {1610-7365}, mesh = {Animals ; *Copulation ; Macaca arctoides/*psychology ; Male ; *Social Dominance ; Thailand ; *Vocalization, Animal ; }, abstract = {Male copulation calls sometimes play important roles in sexual strategies, attracting conspecific females or advertising their social status to conspecific males. These calls generally occur in sexually competitive societies such as harem groups and multi-male and multi-female societies. However, the call functions remain unclear because of limited availability of data sets that include a large number of male and female animals in naturalistic environments, particularly in primates. Here, we examined the possible function of male-specific copulation calls in wild stump-tailed macaques (Macaca arctoides) by analyzing the contexts and acoustic features of vocalizations. We observed 395 wild stump-tailed macaques inhabiting the Khao Krapuk Khao Taomor Non-Hunting Area in Thailand and recorded all occurrences of observed copulations. We counted 446 male-specific calls in 383 copulations recorded, and measured their acoustic characteristics. Data were categorized into three groups depending on their social status: dominant (alpha and coalition) males and non-dominant males. When comparing male status, alpha males most frequently produced copulation calls at ejaculation, coalition males produced less frequent calls than alpha males, and other non-dominant males rarely vocalized, maintaining silence even when mounting females. Acoustic analysis indicated no significant influence of status (alpha or coalition) on call number, bout duration, or further formant dispersion parameters. Our results suggest that male copulation calls of this species are social status-dependent signals. Furthermore, dominant males might actively transmit their social status and copulations to other male rivals to impede their challenging attacks, while other non-dominant males maintain silence to prevent the interference of dominants.}, }
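[Editor's note] The "formant dispersion parameters" analyzed in the entry above are not defined in the abstract; the measure standardly used in animal-call research is Fitch's (1997) formant dispersion, the mean spacing between adjacent formants. A sketch in LaTeX, assuming that standard definition rather than the authors' exact variant:

D_f \;=\; \frac{1}{N-1}\sum_{i=1}^{N-1}\left(F_{i+1}-F_{i}\right) \;=\; \frac{F_{N}-F_{1}}{N-1}

where F_i is the i-th formant frequency and N is the number of measured formants. Because the summed differences telescope, only the lowest and highest formants matter, and D_f scales inversely with vocal tract length, which is why it is used as a cue to caller body size.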
@article {pmid32305174, year = {2021}, author = {Saldías, M and Laukkanen, AM and Guzmán, M and Miranda, G and Stoney, J and Alku, P and Sundberg, J}, title = {The Vocal Tract in Loud Twang-Like Singing While Producing High and Low Pitches.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {35}, number = {5}, pages = {807.e1-807.e23}, doi = {10.1016/j.jvoice.2020.02.005}, pmid = {32305174}, issn = {1873-4588}, mesh = {Acoustics ; Humans ; Male ; Phonation ; *Singing ; *Voice ; Voice Quality ; }, abstract = {UNLABELLED: Twang-like vocal qualities have been related to a megaphone-like shape of the vocal tract (epilaryngeal tube and pharyngeal narrowing, and a wider mouth opening), low-frequency spectral changes, and tighter and/or increased vocal fold adduction. Previous studies have focused mainly on loud and high-pitched singing, comfortable low-pitched spoken vowels, or are based on modeling and simulation. There are no data available on twang-like voices in loud, low-pitched singing.
PURPOSE: This study investigates the possible contribution of the lower and upper vocal tract configurations during loud twang-like singing on high and low pitches in a real subject.
METHODS: One male contemporary commercial music singer produced a sustained vowel [a:] at his habitual speaking pitch (B2) and loudness. The same vowel was also produced in a loud twang-like singing voice at high (G4) and low (B2) pitches. Computerized tomography, acoustic analysis, inverse filtering, and audio-perceptual assessments were performed.
RESULTS: Both loud twang-like voices showed a megaphone-like shape of the vocal tract, more notably at the low pitch. Low-frequency spectral changes, a peak of sound energy around 3 kHz, and increased vocal fold adduction were also found. Results agreed with the audio-perceptual evaluation.
CONCLUSIONS: Loud twang-like phonation seems to be mainly related to low-frequency spectral changes (under 2 kHz) and a more compact formant structure. Twang-like qualities seem to require different degrees of twang-related vocal tract adjustments while phonating at different pitches. A wider mouth opening, pharyngeal constriction, and epilaryngeal tube narrowing may be helpful strategies for maximum power transfer and improved vocal economy in loud contemporary commercial music singing and potentially in loud speech. Further studies should focus on vocal efficiency and vocal economy measurements using modeling and simulation, based on real singers' data.}, }
@article {pmid32302643, year = {2020}, author = {Yaralı, M}, title = {Varying effect of noise on sound onset and acoustic change evoked auditory cortical N1 responses evoked by a vowel-vowel stimulus.}, journal = {International journal of psychophysiology : official journal of the International Organization of Psychophysiology}, volume = {152}, number = {}, pages = {36-43}, doi = {10.1016/j.ijpsycho.2020.04.010}, pmid = {32302643}, issn = {1872-7697}, mesh = {Acoustic Stimulation ; Adult ; Auditory Cortex/*physiology ; Electroencephalography ; Evoked Potentials, Auditory/*physiology ; Female ; Humans ; Male ; Noise ; Speech Perception/*physiology ; Young Adult ; }, abstract = {INTRODUCTION: According to previous studies, noise causes prolonged latencies and decreased amplitudes in acoustic change evoked cortical responses. Particularly for a consonant-vowel stimulus, speech-shaped noise leads to more pronounced changes in the onset evoked response than in the acoustic change evoked response. Reasoning that this may be related to the spectral characteristics of the stimuli and the noise, in the current study a vowel-vowel stimulus (/ui/) was presented in white noise during cortical response recordings. The hypothesis is that the effect of noise will be greater on the acoustic change N1 than on the onset N1 due to masking effects on formant transitions.
METHODS: Onset and acoustic change evoked auditory cortical N1-P2 responses were obtained from 21 young adults with normal hearing while presenting 1000 ms /ui/ stimuli in quiet and in white noise at +10 dB and 0 dB signal-to-noise ratio (SNR).
RESULTS: In the quiet and +10 dB SNR conditions, the N1-P2 responses to both onset and change were present. In the +10 dB SNR condition, acoustic change N1-P2 peak-to-peak amplitudes were reduced and N1 latencies were prolonged compared to the quiet condition, whereas there was no significant change in onset N1 latencies or N1-P2 peak-to-peak amplitudes. In the 0 dB SNR condition, change responses were not observed, but onset N1-P2 peak-to-peak amplitudes were significantly lower and onset N1 latencies significantly longer compared to the quiet and +10 dB SNR conditions. Onset and change responses were also compared with each other in each condition: N1 latencies and N1-P2 peak-to-peak amplitudes of onset and acoustic change were not significantly different in the quiet condition, whereas at +10 dB SNR, acoustic change N1 latencies were longer and N1-P2 amplitudes lower than those of the onset response. In summary, white noise at +10 dB SNR reduced acoustic change evoked N1-P2 peak-to-peak amplitudes and prolonged N1 latencies relative to quiet; the same effect on onsets was only observed at 0 dB SNR, where the acoustic change N1 was absent.
DISCUSSION/CONCLUSIONS: The effect of noise was found to be greater on the acoustic change evoked N1 response than on the onset N1. This may be related to the spectral characteristics of the utilized noise and the stimuli, possible differences in the acoustic features of sound onsets and acoustic changes, or possible differences in the mechanisms for detecting acoustic changes and sound onsets. To investigate the possible reasons for the more pronounced effect of noise on acoustic changes, future work with different vowel-vowel transitions in different noise types is suggested.}, }
@article {pmid32245663, year = {2021}, author = {Tykalova, T and Skrabal, D and Boril, T and Cmejla, R and Volin, J and Rusz, J}, title = {Effect of Ageing on Acoustic Characteristics of Voice Pitch and Formants in Czech Vowels.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {35}, number = {6}, pages = {931.e21-931.e33}, doi = {10.1016/j.jvoice.2020.02.022}, pmid = {32245663}, issn = {1873-4588}, mesh = {*Acoustics ; Adult ; Aged ; Aged, 80 and over ; Aging ; Czech Republic ; Female ; Humans ; Language ; Male ; Middle Aged ; Phonetics ; *Speech Acoustics ; Young Adult ; }, abstract = {BACKGROUND: The relevance of formant-based measures has been noted across a spectrum of medical, technical, and linguistic applications. Therefore, the primary aim of the study was to evaluate the effect of ageing on vowel articulation, as the previous research revealed contradictory findings. The secondary aim was to provide normative acoustic data for all Czech monophthongs.
METHODS: The database consisted of 100 healthy speakers (50 men and 50 women) aged between 20 and 90. Acoustic characteristics, including vowel duration, vowel space area (VSA), fundamental frequency (fo), and the first to fourth formant frequencies (F1-F4) of 10 Czech vowels were extracted from a reading passage. In addition, the articulation rate was calculated from the entire duration of the reading passage.
RESULTS: Age-related changes in pitch were sex-dependent, while age-related alterations in F2/a/, F2/u/, VSA, and vowel duration seemed to be sex-independent. In particular, we observed a clear lowering of fo with age for women, but no change for men. With regard to formants, we found lowering of F2/a/ and F2/u/ with increased age, but no statistically significant changes in F1, F3, or F4 frequencies with advanced age. Although the alterations in F1 and F2 frequencies were rather small, they appeared to be in a direction against vowel centralization, resulting in a significantly greater VSA in the older population. The greater VSA was found to be related partly to longer vowel duration.
CONCLUSIONS: Alterations in vowel formant frequencies across several decades of adult life appear to be small or in a direction against vowel centralization, thus indicating the good preservation of articulatory precision in older speakers.}, }
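[Editor's note] The vowel space area (VSA) reported in the entry above is conventionally computed as the area of the polygon spanned by the mean (F1, F2) points of the corner vowels. A minimal Python sketch using the shoelace formula; the formant values below are illustrative placeholders, not data from the study:

import numpy as np  # only the standard library and numpy-style floats are needed

# Hypothetical (F1, F2) means in Hz for three corner vowels
corner_vowels = {
    "i": (300.0, 2300.0),
    "a": (750.0, 1300.0),
    "u": (320.0, 800.0),
}

def vowel_space_area(points):
    """Polygon area (Hz^2) from (F1, F2) vertices taken in order (shoelace formula)."""
    n = len(points)
    s = 0.0
    for k in range(n):
        x1, y1 = points[k]
        x2, y2 = points[(k + 1) % n]  # wrap around to close the polygon
        s += x1 * y2 - x2 * y1
    return abs(s) / 2.0

print(vowel_space_area(list(corner_vowels.values())))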
@article {pmid32237805, year = {2020}, author = {Milenkovic, PH and Wagner, M and Kent, RD and Story, BH and Vorperian, HK}, title = {Effects of sampling rate and type of anti-aliasing filter on linear-predictive estimates of formant frequencies in men, women, and children.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {3}, pages = {EL221}, pmid = {32237805}, issn = {1520-8524}, support = {P30 HD003352/HD/NICHD NIH HHS/United States ; R01 DC006282/DC/NIDCD NIH HHS/United States ; U54 HD090256/HD/NICHD NIH HHS/United States ; }, mesh = {*Acoustics ; Child ; Female ; Humans ; Male ; *Speech ; Speech Acoustics ; }, abstract = {The purpose of this study was to assess the effect of downsampling the acoustic signal on the accuracy of linear-predictive (LPC) formant estimation. Based on speech produced by men, women, and children, the first four formant frequencies were estimated at sampling rates of 48, 16, and 10 kHz using different anti-alias filtering. With proper selection of number of LPC coefficients, anti-alias filter and between-frame averaging, results suggest that accuracy is not improved by rates substantially below 48 kHz. Any downsampling should not go below 16 kHz with a filter cut-off centered at 8 kHz.}, }
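[Editor's note] The entry above turns on how sampling rate and anti-alias filtering interact with LPC formant estimation. A minimal Python sketch of the standard pipeline (pre-emphasis, windowing, LPC fit, polynomial root-solving) follows; the model-order rule of thumb, the resampler's built-in anti-alias filtering, and the file name are assumptions, not the authors' settings:

import numpy as np
import librosa

def lpc_formants(y, sr, order=None):
    """Estimate formant frequencies (Hz) from one frame via LPC root-solving."""
    if order is None:
        order = int(sr / 1000) + 2                  # common rule of thumb, not the paper's setting
    y = np.append(y[0], y[1:] - 0.97 * y[:-1])      # pre-emphasis
    a = librosa.lpc(y * np.hamming(len(y)), order=order)
    roots = [r for r in np.roots(a) if np.imag(r) > 0]
    freqs = sorted(np.angle(roots) * sr / (2 * np.pi))
    return [f for f in freqs if 90 < f < sr / 2 - 50]  # drop near-DC and near-Nyquist roots

# Compare estimates at 48 kHz vs. a 16 kHz downsampled copy ("vowel.wav" is hypothetical)
y, sr = librosa.load("vowel.wav", sr=48000)
frame = y[:int(0.05 * sr)]                          # one 50 ms analysis frame
print(lpc_formants(frame, sr))
y16 = librosa.resample(y, orig_sr=48000, target_sr=16000)  # resampler applies anti-alias filtering
print(lpc_formants(y16[:int(0.05 * 16000)], 16000))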
@article {pmid32201644, year = {2020}, author = {Chen, ZQ and Lin, YF and Tang, Y and Ding, GH and Wu, YQ and Lin, ZH}, title = {Acoustic divergence in advertisement calls among three sympatric Microhyla species from East China.}, journal = {PeerJ}, volume = {8}, number = {}, pages = {e8708}, pmid = {32201644}, issn = {2167-8359}, abstract = {BACKGROUND: Species-specific advertisement calls are the main mechanism of transmitting information between individuals in anuran amphibians and are therefore indispensable for anuran survival and reproduction. Survey methods that monitor these calls can be used for rapid species recognition, behavioral experiments, and conservation monitoring. In this study, we described in detail 10 call parameters from three sympatric species in the genus Microhyla and analyzed the differences in call parameters among these species to provide a basis for systematic monitoring, acoustic analysis and taxonomic study of this genus.
METHODS: The quantitative analyses of temporal and spectral call parameters were used in our study for the advertisement calls of three sympatric Microhyla species (M. beilunensis, M. fissipes and M. heymonsi) in Zhejiang Province, East China.
RESULTS: Our results showed the following: (1) Significant differences existed among the three sympatric Microhyla species in call duration (CD), call interval (CI), number of pulses (NP), pulse rate, call intensity (CIT), dominant frequency (DF) and frequency of the first to fourth formants (F1, F2, F3 and F4). (2) Some spectral parameters (DF, F1 and F3) were negatively correlated with the body size of the vocalizing individuals in each species. (3) The coefficients of variation within individuals (CVw) for CIT, DF and F1-F4 were smaller than 5%, whereas the CVw for CI was larger than 10% in each species. (4) The principal component analysis and discriminant function analysis showed that call parameters could distinguish the three Microhyla species. (5) The phylogenetic generalized least squares analysis showed that phylogenetic relationships affected CD and NP against snout-vent length (SVL), DF and NP against CD, and NP against DF, but not DF against SVL; based on the phylogenetic analysis, CD and NP were not related to SVL, but DF was negatively related to SVL.}, }
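[Editor's note] The within-individual coefficient of variation (CVw) used in the entry above is simply the standard deviation of repeated measurements of one call parameter from one individual, scaled by their mean. A minimal Python sketch with hypothetical values:

import numpy as np

def coefficient_of_variation(x):
    """CV in percent: within-individual variability of one call parameter."""
    x = np.asarray(x, dtype=float)
    return 100.0 * x.std(ddof=1) / x.mean()

# Hypothetical dominant-frequency measurements (kHz) from one male's repeated calls
df_calls = [3.10, 3.20, 3.00, 3.15, 3.05]
print(round(coefficient_of_variation(df_calls), 2))  # < 5% would indicate a stereotyped parameter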
@article {pmid32196513, year = {2020}, author = {Deloche, F}, title = {Fine-grained statistical structure of speech.}, journal = {PloS one}, volume = {15}, number = {3}, pages = {e0230233}, pmid = {32196513}, issn = {1932-6203}, mesh = {Acoustic Stimulation/methods ; Cochlea/physiology ; Cochlear Implants ; Humans ; Phonetics ; Speech/*physiology ; Speech Acoustics ; Speech Perception/physiology ; }, abstract = {In spite of its acoustic diversity, the speech signal presents statistical regularities that can be exploited by biological or artificial systems for efficient coding. Independent Component Analysis (ICA) revealed that on small time scales (∼ 10 ms), the overall structure of speech is well captured by a time-frequency representation whose frequency selectivity follows the same power law in the high frequency range 1-8 kHz as cochlear frequency selectivity in mammals. Variations in the power-law exponent, i.e. different time-frequency trade-offs, have been shown to provide additional adaptation to phonetic categories. Here, we adopt a parametric approach to investigate the variations of the exponent at a finer level of speech. The estimation procedure is based on a measure that reflects the sparsity of decompositions in a set of Gabor dictionaries whose atoms are Gaussian-modulated sinusoids. We examine the variations of the exponent associated with the best decomposition, first at the level of phonemes, then at an intra-phonemic level. We show that this analysis offers a rich interpretation of the fine-grained statistical structure of speech, and that the exponent values can be related to key acoustic properties. Two main results are: i) for plosives, the exponent is lowered by the release bursts, concealing higher values during the opening phases; ii) for vowels, the exponent is bound to formant bandwidths and decreases with the degree of acoustic radiation at the lips. This work further suggests that an efficient coding strategy is to reduce frequency selectivity with sound intensity level, congruent with the nonlinear behavior of cochlear filtering.}, }
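[Editor's note] A schematic rendering in LaTeX of the dictionary construction described in the entry above, hedged because it does not reproduce the paper's exact notation: each dictionary consists of Gaussian-modulated sinusoids (Gabor atoms), and the power-law exponent \alpha ties the Gaussian time scale to centre frequency,

g_{f,\sigma(f)}(t) = \exp\!\left(-\frac{t^{2}}{2\,\sigma(f)^{2}}\right)\cos\!\left(2\pi f t + \varphi\right),
\qquad \sigma(f) \propto f^{-\alpha}.

Under this parameterization the analysis bandwidth grows as f^{\alpha}: \alpha = 1 gives constant-Q, wavelet-like selectivity, \alpha = 0 gives the constant bandwidth of a short-time Fourier transform, and intermediate exponents interpolate between the two, which is where cochlea-like behavior lies.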
@article {pmid32196397, year = {2020}, author = {Hardy, TLD and Boliek, CA and Aalto, D and Lewicke, J and Wells, K and Rieger, JM}, title = {Contributions of Voice and Nonverbal Communication to Perceived Masculinity-Femininity for Cisgender and Transgender Communicators.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {4}, pages = {931-947}, doi = {10.1044/2019_JSLHR-19-00387}, pmid = {32196397}, issn = {1558-9102}, mesh = {Female ; Femininity ; Humans ; Male ; Masculinity ; Nonverbal Communication ; Speech Acoustics ; *Transgender Persons ; *Voice ; }, abstract = {Purpose The purpose of this study was twofold: (a) to identify a set of communication-based predictors (including both acoustic and gestural variables) of masculinity-femininity ratings and (b) to explore differences in ratings between audio and audiovisual presentation modes for transgender and cisgender communicators. Method The voices and gestures of a group of cisgender men and women (n = 10 of each) and transgender women (n = 20) communicators were recorded while they recounted the story of a cartoon using acoustic and motion capture recording systems. A total of 17 acoustic and gestural variables were measured from these recordings. A group of observers (n = 20) rated each communicator's masculinity-femininity based on 30- to 45-s samples of the cartoon description presented in three modes: audio, visual, and audiovisual. Visual and audiovisual stimuli contained point-light displays standardized for size. Ratings were made using a direct magnitude estimation scale without modulus. Communication-based predictors of masculinity-femininity ratings were identified using multiple regression, and analysis of variance was used to determine the effect of presentation mode on perceptual ratings. Results Fundamental frequency, average vowel formant, and sound pressure level were identified as significant predictors of masculinity-femininity ratings for these communicators. Communicators were rated significantly more feminine in the audio than the audiovisual mode and unreliably in the visual-only mode. Conclusions Both study purposes were met. Results support continued emphasis on fundamental frequency and vocal tract resonance in voice and communication modification training with transgender individuals and provide evidence for the potential benefit of modifying sound pressure level, especially when a masculine presentation is desired.}, }
@article {pmid32160481, year = {2020}, author = {Carl, M and Kent, RD and Levy, ES and Whalen, DH}, title = {Vowel Acoustics and Speech Intelligibility in Young Adults With Down Syndrome.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {3}, pages = {674-687}, doi = {10.1044/2019_JSLHR-19-00204}, pmid = {32160481}, issn = {1558-9102}, mesh = {Acoustics ; *Down Syndrome/complications ; Humans ; Phonetics ; Speech Acoustics ; *Speech Intelligibility ; *Speech Perception ; Speech Production Measurement ; Young Adult ; }, abstract = {Purpose Speech production deficits and reduced intelligibility are frequently noted in individuals with Down syndrome (DS) and are attributed to a combination of several factors. This study reports acoustic data on vowel production in young adults with DS and relates these findings to perceptual analysis of speech intelligibility. Method Participants were eight young adults with DS as well as eight age- and gender-matched typically developing (TD) controls. Several different acoustic measures of vowel centralization and variability were applied to tokens of corner vowels (/ɑ/, /æ/, /i/, /u/) produced in common English words. Intelligibility was assessed for single-word productions of speakers with DS, by means of transcriptions from 14 adult listeners. Results Group differentiation was found for some, but not all, of the acoustic measures. Low vowels were more acoustically centralized and variable in speakers with DS than TD controls. Acoustic findings were associated with overall intelligibility scores. Vowel formant dispersion was the most sensitive measure in distinguishing DS and TD formant data. Conclusion Corner vowels are differentially affected in speakers with DS. The acoustic characterization of vowel production and its association with speech intelligibility scores within the DS group support the conclusion of motor control deficits in the overall speech impairment. Implications are discussed for effective treatment planning.}, }
@article {pmid32160080, year = {2020}, author = {Coy, A and Watson, S}, title = {Acoustic Similarity of Inner and Outer Circle Varieties of Child-Produced English Vowels.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {3}, pages = {722-737}, doi = {10.1044/2019_JSLHR-19-00179}, pmid = {32160080}, issn = {1558-9102}, mesh = {Acoustics ; Adult ; Child ; Humans ; Language ; *Phonetics ; Speech ; *Speech Acoustics ; United States ; }, abstract = {Purpose This article compares acoustic data of normally developing children from two dominant and one nondominant variety of English in order to determine phonetic proximity. Method The study focuses on one variety of American English (AE), one British English (BE) variety, and one Jamaican English (JE) variety owing to the historical and sociopolitical influences of both dominant varieties on JE. The work examines the four corner vowels (/a/, /ɑ/, /u:/, and /i:/) of the specified varieties. Speech from children aged 8-11 years was processed to extract duration, intensity, and fundamental frequency as well as the first three formants (F1, F2, and F3) of each vowel. Results Analysis of the acoustic variables showed, for the first time, that child-produced JE is phonetically closer to the variety of BE studied, than it is to the American variety. The acoustic properties of the child-produced JE vowels were found to be similar to those of adult-produced vowels, suggesting that, as has been shown for adult speech, there appears to be a limited impact of AE on JE. Conclusions This is the first acoustic study of children's speech to show that, despite the proximity to BE, the Jamaican variety is clearly a distinct variety of English. As the first study comparing AE, BE, and JE, the article provides experimental evidence of the acoustic differences in the varieties and points to the implications for automatic speech recognition and educational applications for children who speak JE.}, }
@article {pmid32149701, year = {2020}, author = {Zhang, T and Shao, Y and Wu, Y and Pang, Z and Liu, G}, title = {Multiple Vowels Repair Based on Pitch Extraction and Line Spectrum Pair Feature for Voice Disorder.}, journal = {IEEE journal of biomedical and health informatics}, volume = {24}, number = {7}, pages = {1940-1951}, doi = {10.1109/JBHI.2020.2978103}, pmid = {32149701}, issn = {2168-2208}, mesh = {Aged ; Humans ; Neural Networks, Computer ; Sound Spectrography/*methods ; Voice/*physiology ; Voice Disorders/*diagnosis ; *Wavelet Analysis ; }, abstract = {Individuals such as voice-related professionals, elderly people, and smokers increasingly suffer from voice disorders, which underlines the importance of pathological voice repair. Previous work on pathological voice repair was concerned only with the sustained vowel /a/; repairing multiple vowels remains challenging due to unstable pitch extraction and unsatisfactory formant reconstruction. In this paper, a multiple-vowel repair method based on pitch extraction and the Line Spectrum Pair feature for voice disorders is proposed, which broadens the scope of voice repair from the single vowel /a/ to the multiple vowels /a/, /i/ and /u/ and repairs these vowels successfully. Using a deep neural network as a classifier, voice recognition is performed to classify normal and pathological voices. Wavelet Transform and Hilbert-Huang Transform are applied for pitch extraction. Based on the Line Spectrum Pair (LSP) feature, the formant is reconstructed. The final repaired voice is obtained by synthesizing the pitch and the formant. The proposed method is validated on the Saarbrücken Voice Database (SVD). The achieved improvements in three metrics, Segmental Signal-to-Noise Ratio, LSP distance measure and Mel cepstral distance measure, are 45.87%, 50.37% and 15.56%, respectively. In addition, a spectrogram-based analysis demonstrates a prominent repair effect.}, }
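[Editor's note] Of the three repair metrics in the entry above, Segmental Signal-to-Noise Ratio is the most self-contained. A minimal Python sketch of one common formulation (frame-wise SNR against a clean reference, clamped to the conventional [-10, 35] dB range); the paper's exact variant and frame length are not specified, so both are assumptions:

import numpy as np

def segmental_snr(clean, processed, frame_len=256, eps=1e-10):
    """Mean frame-wise SNR in dB between a reference signal and a repaired signal."""
    n = min(len(clean), len(processed)) // frame_len * frame_len
    c = clean[:n].reshape(-1, frame_len)
    p = processed[:n].reshape(-1, frame_len)
    noise = c - p                                   # frame-wise error signal
    snr = 10 * np.log10((c ** 2).sum(axis=1) / ((noise ** 2).sum(axis=1) + eps) + eps)
    return float(np.clip(snr, -10, 35).mean())      # clamp extreme frames, then average

# Usage with synthetic signals, purely for illustration
rng = np.random.default_rng(0)
clean = rng.standard_normal(4096)
print(segmental_snr(clean, clean + 0.1 * rng.standard_normal(4096)))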
@article {pmid32138570, year = {2021}, author = {Figueroa Saavedra, C and Otzen Hernández, T and Alarcón Godoy, C and Ríos Pérez, A and Frugone Salinas, D and Lagos Hernández, R}, title = {Association between suicidal ideation and acoustic parameters of university students' voice and speech: a pilot study.}, journal = {Logopedics, phoniatrics, vocology}, volume = {46}, number = {2}, pages = {55-62}, doi = {10.1080/14015439.2020.1733075}, pmid = {32138570}, issn = {1651-2022}, mesh = {Acoustics ; Adolescent ; Adult ; Cross-Sectional Studies ; Female ; Humans ; Male ; Pilot Projects ; *Speech ; Students ; *Suicidal Ideation ; Universities ; Voice Quality ; Young Adult ; }, abstract = {PURPOSE: Worldwide, suicide is a public health problem; although rates display downward trends in several areas of the world, in many countries they have increased. One of the elements that contributes to its prevention is early and dynamic evaluation. The objective of this study is therefore to determine the association between acoustic parameters of voice and speech (F0, F1, F2, F3, dB, and jitter) and the emergence of suicidal ideation among university students from the city of Temuco, Chile.
METHODS: A cross-sectional study was conducted with a non-probabilistic sample of sixty 18- and 19-year-old adolescents from the city of Temuco, who underwent an acoustic evaluation of their voice and speech after taking a test to determine suicidal ideation. Data were then analyzed with IBM SPSS version 23.0 software (IBM SPSS Statistics, Armonk, NY) by means of exploratory, descriptive, and inferential statistics, taking the variables' levels of measurement and types of distribution into account.
RESULTS: The results indicate that 30% of the adolescents, of both genders, displayed suicidal ideation. Among the acoustic measures, the fundamental frequency (F0), the formants F1 and F2, and jitter were the parameters most strongly associated with the presence of suicidal ideation, in both women and men (p < .05). The characteristics describing F3 were linked to the presence of suicidal ideation only in men (p < .05).
CONCLUSIONS: It is concluded that the acoustic parameters of voice and speech differ in adolescents with suicidal behavior, and may therefore represent a useful tool in the assessment of suicide risk.}, }
@article {pmid32113329, year = {2020}, author = {Allison, KM and Salehi, S and Green, JR}, title = {Effect of prosodic manipulation on articulatory kinematics and second formant trajectories in children.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {2}, pages = {769}, pmid = {32113329}, issn = {1520-8524}, support = {F32 DC016484/DC/NIDCD NIH HHS/United States ; K24 DC016312/DC/NIDCD NIH HHS/United States ; }, mesh = {Biomechanical Phenomena ; Child ; Dysarthria ; Humans ; Phonetics ; Speech ; *Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {This study investigated effects of rate reduction and emphatic stress cues on second formant (F2) trajectories and articulatory movements during diphthong production in 11 typically developing school-aged children. F2 extent increased in slow and emphatic stress conditions, and tongue and jaw displacement increased in the emphatic stress condition compared to habitual speech. Tongue displacement significantly predicted F2 extent across speaking conditions. Results suggest that slow rate and emphatic stress cues induce articulatory and acoustic changes in children that may enhance clarity of the acoustic signal. Potential clinical implications for improving speech in children with dysarthria are discussed.}, }
@article {pmid32113320, year = {2020}, author = {Summers, RJ and Roberts, B}, title = {Informational masking of speech by acoustically similar intelligible and unintelligible interferers.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {2}, pages = {1113}, doi = {10.1121/10.0000688}, pmid = {32113320}, issn = {1520-8524}, abstract = {Masking experienced when target speech is accompanied by a single interfering voice is often primarily informational masking (IM). IM is generally greater when the interferer is intelligible than when it is not (e.g., speech from an unfamiliar language), but the relative contributions of acoustic-phonetic and linguistic interference are often difficult to assess owing to acoustic differences between interferers (e.g., different talkers). Three-formant analogues (F1+F2+F3) of natural sentences were used as targets and interferers. Targets were presented monaurally either alone or accompanied contralaterally by interferers from another sentence (F0 = 4 semitones higher); a target-to-masker ratio (TMR) between ears of 0, 6, or 12 dB was used. Interferers were either intelligible or rendered unintelligible by delaying F2 and advancing F3 by 150 ms relative to F1, a manipulation designed to minimize spectro-temporal differences between corresponding interferers. Target-sentence intelligibility (keywords correct) was 67% when presented alone, but fell considerably when an unintelligible interferer was present (49%) and significantly further when the interferer was intelligible (41%). Changes in TMR produced neither a significant main effect nor an interaction with interferer type. Interference with acoustic-phonetic processing of the target can explain much of the impact on intelligibility, but linguistic factors-particularly interferer intrusions-also make an important contribution to IM.}, }
@article {pmid32113256, year = {2020}, author = {Winn, MB}, title = {Manipulation of voice onset time in speech stimuli: A tutorial and flexible Praat script.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {2}, pages = {852}, doi = {10.1121/10.0000692}, pmid = {32113256}, issn = {1520-8524}, abstract = {Voice onset time (VOT) is an acoustic property of stop consonants that is commonly manipulated in studies of phonetic perception. This paper contains a thorough description of the "progressive cutback and replacement" method of VOT manipulation, and comparison with other VOT manipulation techniques. Other acoustic properties that covary with VOT-such as fundamental frequency and formant transitions-are also discussed, along with considerations for testing VOT perception and its relationship to various other measures of auditory temporal or spectral processing. An implementation of the progressive cutback and replacement method in the Praat scripting language is presented, which is suitable for modifying natural speech for perceptual experiments involving VOT and/or related covarying F0 and intensity cues. Justifications are provided for the stimulus design choices and constraints implemented in the script.}, }
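[Editor's note] Winn's tutorial implements the progressive cutback and replacement method in Praat; purely as a language-neutral illustration of the idea (not the author's script), the Python sketch below builds a VOT continuum by replacing successively longer stretches of a voiced token's onset with aspiration from a voiceless production. File names and step sizes are hypothetical:

import numpy as np
import soundfile as sf  # assumed I/O library

def vot_continuum(voiced, aspirated, sr, steps_ms=(0, 10, 20, 30, 40)):
    """Progressive cutback and replacement (schematic): for each step, the first
    d ms of the voiced token are removed and replaced by the first d ms of
    aspiration from a voiceless production, lengthening the effective VOT."""
    out = []
    for d in steps_ms:
        n = int(sr * d / 1000)
        out.append(np.concatenate([aspirated[:n], voiced[n:]]))
    return out

# Hypothetical natural recordings of /da/ (short VOT) and /ta/ (long aspiration)
da, sr = sf.read("da.wav")
ta, _ = sf.read("ta.wav")
for i, token in enumerate(vot_continuum(da, ta, sr)):
    sf.write(f"step_{i}.wav", token, sr)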
@article {pmid32111954, year = {2020}, author = {Riggs, WJ and Hiss, MM and Skidmore, J and Varadarajan, VV and Mattingly, JK and Moberly, AC and Adunka, OF}, title = {Utilizing Electrocochleography as a Microphone for Fully Implantable Cochlear Implants.}, journal = {Scientific reports}, volume = {10}, number = {1}, pages = {3714}, pmid = {32111954}, issn = {2045-2322}, mesh = {Adolescent ; Adult ; Aged ; Aged, 80 and over ; Audiometry, Evoked Response ; Auditory Threshold ; Cochlear Implantation ; Cochlear Implants ; Hearing ; Hearing Loss/physiopathology/*therapy ; Humans ; Middle Aged ; Sound ; Young Adult ; }, abstract = {Current cochlear implants (CIs) are semi-implantable devices with an externally worn unit that hosts the microphone and sound processor. A fully implantable device, however, would ultimately be desirable as it would be of great benefit to recipients. While some prototypes have been designed and used in a few select cases, one main stumbling block is the sound input. Specifically, subdermal implantable microphone technology has been beset by physiologic issues such as sound distortion and signal attenuation under the skin. Here we propose an alternative method that utilizes a physiologic response composed of an electrical field generated by the sensory cells of the inner ear to serve as a sound source microphone for fully implantable hearing technology such as CIs. Electrophysiological results obtained from 14 participants (adult and pediatric) document the feasibility of capturing speech properties within the electrocochleography (ECochG) response. Degradation of the formant properties of the stimuli /da/ and /ba/ is evaluated across various degrees of hearing loss. Preliminary results provide proof of concept that using the ECochG response as a microphone is feasible for capturing vital properties of speech. However, further signal-processing refinement is needed, along with use of an intracochlear recording location, to improve signal fidelity.}, }
@article {pmid32104050, year = {2020}, author = {Kim, HT}, title = {Vocal Feminization for Transgender Women: Current Strategies and Patient Perspectives.}, journal = {International journal of general medicine}, volume = {13}, number = {}, pages = {43-52}, pmid = {32104050}, issn = {1178-7074}, abstract = {Voice feminization for transgender women is a highly complicated, comprehensive transition process. Voice feminization has often been equated with pitch elevation, so many surgical procedures have focused only on raising pitch. However, voice feminization should consider not only voice pitch but also gender differences in the physical, neurophysiological, and acoustical characteristics of voice. That is why voice therapy has been the preferred choice for feminization of the voice. Considering gender differences in the phonatory system, voice feminization consists of changing four critical elements: fundamental frequency, resonance frequency related to vocal tract volume and length, formant tuning, and phonatory pattern. The voice feminizing process can generally be divided into non-surgical and surgical feminization. As a non-surgical procedure, feminization voice therapy consists of increasing fundamental frequency, improving oral and pharyngeal resonance, and behavioral therapy. Surgical feminization can usually be achieved by an external or an endoscopic approach. Based on the three factors governing pitch modulation (vocal fold length, tension, and mass), surgical procedures can be classified as one-factor, two-factor, and three-factor modifications of the vocal folds. Recent systematic reviews and meta-analyses have reported positive outcomes for both voice therapy and voice feminization surgery. Voice therapy is highly satisfactory, mostly increases vocal pitch, and is noninvasive. However, surgical voice feminization with three-factor modification of the vocal folds is also highly effective and provides the maximum absolute increase in vocal pitch. Voice feminization is a long transition journey of physical, neurophysiological, and psychosomatic changes that converts a male phonatory system to a female phonatory system. Therefore, strategies for voice feminization should be individualized according to the individual's physical condition, the desired change in voice pitch, economic circumstances, and social roles.}, }
@article {pmid32077196, year = {2020}, author = {Levy, ES and Moya-Galé, G and Chang, YM and Campanelli, L and MacLeod, AAN and Escorial, S and Maillart, C}, title = {Effects of speech cues in French-speaking children with dysarthria.}, journal = {International journal of language & communication disorders}, volume = {55}, number = {3}, pages = {401-416}, doi = {10.1111/1460-6984.12526}, pmid = {32077196}, issn = {1460-6984}, mesh = {Adolescent ; Cerebral Palsy/complications/*psychology ; Child ; *Cues ; Dysarthria/etiology/*psychology ; Female ; Humans ; Male ; *Speech ; Speech Acoustics ; Speech Intelligibility ; }, abstract = {BACKGROUND: Articulatory excursion and vocal intensity are reduced in many children with dysarthria due to cerebral palsy (CP), contributing to the children's intelligibility deficits and negatively affecting their social participation. However, the effects of speech-treatment strategies for improving intelligibility in this population are understudied, especially for children who speak languages other than English. In a cueing study on English-speaking children with dysarthria, acoustic variables and intelligibility improved when the children were provided with cues aimed to increase articulatory excursion and vocal intensity. While French is among the top 20 most spoken languages in the world, dysarthria and its management in French-speaking children are virtually unexplored areas of research. Information gleaned from such research is critical for providing an evidence base on which to provide treatment.
AIMS: To examine acoustic and perceptual changes in the speech of French-speaking children with dysarthria, who are provided with speech cues targeting greater articulatory excursion (French translation of 'speak with your big mouth') and vocal intensity (French translation of 'speak with your strong voice'). This study investigated whether, in response to the cues, the children would make acoustic changes and listeners would perceive the children's speech as more intelligible.
METHODS & PROCEDURES: Eleven children with dysarthria due to CP (six girls, five boys; ages 4;11-17;0 years; eight with spastic CP, three with dyskinetic CP) repeated pre-recorded speech stimuli across three speaking conditions (habitual, 'big mouth' and 'strong voice'). Stimuli were sentences and contrastive words in phrases. Acoustic analyses were conducted. A total of 66 Belgian-French listeners transcribed the children's utterances orthographically and rated their ease of understanding on a visual analogue scale at sentence and word levels.
OUTCOMES & RESULTS: Acoustic analyses revealed significantly longer duration in response to the big mouth cue at sentence level and in response to both the big mouth and strong voice cues at word level. Significantly higher vocal sound-pressure levels were found following both cues at sentence and word levels. Both cues elicited significantly higher first-formant vowel frequencies and listeners' greater ease-of-understanding ratings at word level. Increases in the percentage of words transcribed correctly and in sentence ease-of-understanding ratings, however, did not reach statistical significance. Considerable variability between children was observed.
CONCLUSIONS & IMPLICATIONS: Speech cues targeting greater articulatory excursion and vocal intensity yield significant acoustic changes in French-speaking children with dysarthria. However, the changes may only aid listeners' ease of understanding at word level. The significant findings and great inter-speaker variability are generally consistent with studies on English-speaking children with dysarthria, although changes appear more constrained in these French-speaking children. What this paper adds What is already known on the subject According to the only study comparing effects of speech-cueing strategies on English-speaking children with dysarthria, intelligibility increases when the children are provided with cues aimed to increase articulatory excursion and vocal intensity. Little is known about speech characteristics in French-speaking children with dysarthria and no published research has explored effects of cueing strategies in this population. What this paper adds to existing knowledge This paper is the first study to examine the effects of speech cues on the acoustics and intelligibility of French-speaking children with CP. It provides evidence that the children can make use of cues to modify their speech, although the changes may only aid listeners' ease of understanding at word level. What are the potential or actual clinical implications of this work? For clinicians, the findings suggest that speech cues emphasizing increasing articulatory excursion and vocal intensity show promise for improving the ease of understanding of words produced by francophone children with dysarthria, although improvements may be modest. The variability in the responses also suggests that this population may benefit from a combination of such cues to produce words that are easier to understand.}, }
@article {pmid32076631, year = {2019}, author = {Boë, LJ and Sawallis, TR and Fagot, J and Badin, P and Barbier, G and Captier, G and Ménard, L and Heim, JL and Schwartz, JL}, title = {Which way to the dawn of speech?: Reanalyzing half a century of debates and data in light of speech science.}, journal = {Science advances}, volume = {5}, number = {12}, pages = {eaaw3916}, pmid = {32076631}, issn = {2375-2548}, mesh = {Animals ; *Biological Evolution ; Communication ; Humans ; *Models, Theoretical ; Research ; *Speech ; Vocalization, Animal ; }, abstract = {Recent articles on primate articulatory abilities are revolutionary regarding speech emergence, a crucial aspect of language evolution, by revealing a human-like system of proto-vowels in nonhuman primates and implicitly throughout our hominid ancestry. This article presents both a schematic history and the state of the art in primate vocalization research and its importance for speech emergence. Recent speech research advances allow more incisive comparison of phylogeny and ontogeny and also an illuminating reinterpretation of vintage primate vocalization data. This review produces three major findings. First, even among primates, laryngeal descent is not uniquely human. Second, laryngeal descent is not required to produce contrasting formant patterns in vocalizations. Third, living nonhuman primates produce vocalizations with contrasting formant patterns. Thus, evidence now overwhelmingly refutes the long-standing laryngeal descent theory, which pushes back "the dawn of speech" beyond ~200 ka ago to over ~20 Ma ago, a difference of two orders of magnitude.}, }
@article {pmid32048990, year = {2020}, author = {Bergevin, C and Narayan, C and Williams, J and Mhatre, N and Steeves, JK and Bernstein, JG and Story, B}, title = {Overtone focusing in biphonic tuvan throat singing.}, journal = {eLife}, volume = {9}, number = {}, pages = {}, pmid = {32048990}, issn = {2050-084X}, mesh = {Audiovisual Aids ; Humans ; Magnetic Resonance Imaging ; Pharynx/diagnostic imaging/*physiology ; Russia ; *Singing ; }, abstract = {Khoomei is a unique singing style originating from the republic of Tuva in central Asia. Singers produce two pitches simultaneously: a booming low-frequency rumble alongside a hovering high-pitched whistle-like tone. The biomechanics of this biphonation are not well-understood. Here, we use sound analysis, dynamic magnetic resonance imaging, and vocal tract modeling to demonstrate how biphonation is achieved by modulating vocal tract morphology. Tuvan singers show remarkable control in shaping their vocal tract to narrowly focus the harmonics (or overtones) emanating from their vocal cords. The biphonic sound is a combination of the fundamental pitch and a focused filter state, which is at the higher pitch (1-2 kHz) and formed by merging two formants, thereby greatly enhancing sound-production in a very narrow frequency range. Most importantly, we demonstrate that this biphonation is a phenomenon arising from linear filtering rather than from a nonlinear source.}, }
@article {pmid32041121, year = {2020}, author = {Gabrieli, G and Bornstein, MH and Manian, N and Esposito, G}, title = {Assessing Mothers' Postpartum Depression From Their Infants' Cry Vocalizations.}, journal = {Behavioral sciences (Basel, Switzerland)}, volume = {10}, number = {2}, pages = {}, pmid = {32041121}, issn = {2076-328X}, support = {Intramural Research Program/NH/NIH HHS/United States ; }, abstract = {Postpartum Depression (PPD), a condition that affects up to 15% of mothers in high-income countries, reduces attention to the needs of the child and is among the first causes of infanticide. PPD is usually identified using self-report measures and therefore it is possible that mothers are unwilling to report PPD because of a social desirability bias. Previous studies have highlighted the presence of significant differences in the acoustical properties of the vocalizations of infants of depressed and healthy mothers, suggesting that the mothers' behavior can induce changes in infants' vocalizations. In this study, cry episodes of infants (N = 56, 157.4 days ± 8.5, 62% firstborn) of depressed (N = 29) and non-depressed (N = 27) mothers (mean age = 31.1 years ± 3.9) are analyzed to investigate the possibility that a cloud-based machine learning model can identify PPD in mothers from the acoustical properties of their infants' vocalizations. Acoustic features (fundamental frequency, first four formants, and intensity) are first extracted from recordings of crying infants, then cloud-based artificial intelligence models are employed to identify maternal depression versus non-depression from estimated features. The trained model shows that commonly adopted acoustical features can be successfully used to identify postpartum depressed mothers with high accuracy (89.5%).}, }
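[Editor's note] The entry above classifies maternal depression from acoustic features of infant cries. A minimal local sketch of that pipeline, assuming hypothetical file names, labels, and an f0 search range, and using logistic regression in place of the study's cloud-based models (formant extraction is omitted for brevity):

import numpy as np
import librosa
from sklearn.linear_model import LogisticRegression

def cry_features(path):
    """f0 statistics and RMS intensity from one cry recording."""
    y, sr = librosa.load(path, sr=16000)
    f0 = librosa.yin(y, fmin=150, fmax=900, sr=sr)  # infant-cry f0 search range (assumed)
    rms = librosa.feature.rms(y=y)[0]
    return [np.nanmean(f0), np.nanstd(f0), rms.mean(), rms.std()]

# Hypothetical corpus: file paths and 0/1 labels (1 = mother screened positive for PPD)
paths = ["cry_001.wav", "cry_002.wav", "cry_003.wav", "cry_004.wav"]
labels = np.array([0, 1, 0, 1])

X = np.array([cry_features(p) for p in paths])
clf = LogisticRegression(max_iter=1000).fit(X, labels)
print(clf.predict(X))  # in practice, evaluate with held-out data, not the training set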
@article {pmid32038381, year = {2019}, author = {Kearney, E and Nieto-Castañón, A and Weerathunge, HR and Falsini, R and Daliri, A and Abur, D and Ballard, KJ and Chang, SE and Chao, SC and Heller Murray, ES and Scott, TL and Guenther, FH}, title = {A Simple 3-Parameter Model for Examining Adaptation in Speech and Voice Production.}, journal = {Frontiers in psychology}, volume = {10}, number = {}, pages = {2995}, pmid = {32038381}, issn = {1664-1078}, support = {P50 DC015446/DC/NIDCD NIH HHS/United States ; R03 DC014045/DC/NIDCD NIH HHS/United States ; R01 DC011277/DC/NIDCD NIH HHS/United States ; R01 DC002852/DC/NIDCD NIH HHS/United States ; R01 DC016270/DC/NIDCD NIH HHS/United States ; R01 DC015570/DC/NIDCD NIH HHS/United States ; T32 DC013017/DC/NIDCD NIH HHS/United States ; F31 DC016197/DC/NIDCD NIH HHS/United States ; }, abstract = {Sensorimotor adaptation experiments are commonly used to examine motor learning behavior and to uncover information about the underlying control mechanisms of many motor behaviors, including speech production. In the speech and voice domains, aspects of the acoustic signal are shifted/perturbed over time via auditory feedback manipulations. In response, speakers alter their production in the opposite direction of the shift so that their perceived production is closer to what they intended. This process relies on a combination of feedback and feedforward control mechanisms that are difficult to disentangle. The current study describes and tests a simple 3-parameter mathematical model that quantifies the relative contribution of feedback and feedforward control mechanisms to sensorimotor adaptation. The model is a simplified version of the DIVA model, an adaptive neural network model of speech motor control. The three fitting parameters of SimpleDIVA are associated with the three key subsystems involved in speech motor control, namely auditory feedback control, somatosensory feedback control, and feedforward control. The model is tested through computer simulations that identify optimal model fits to six existing sensorimotor adaptation datasets. We show its utility in (1) interpreting the results of adaptation experiments involving the first and second formant frequencies as well as fundamental frequency; (2) assessing the effects of masking noise in adaptation paradigms; (3) fitting more than one perturbation dimension simultaneously; (4) examining sensorimotor adaptation at different timepoints in the production signal; and (5) quantitatively predicting responses in one experiment using parameters derived from another experiment. The model simulations produce excellent fits to real data across different types of perturbations and experimental paradigms (mean correlation between data and model fits across all six studies = 0.95 ± 0.02). The model parameters provide a mechanistic explanation for the behavioral responses to the adaptation paradigm that are not readily available from the behavioral responses alone. Overall, SimpleDIVA offers new insights into speech and voice motor control and has the potential to inform future directions of speech rehabilitation research in disordered populations. Simulation software, including an easy-to-use graphical user interface, is publicly available to facilitate the use of the model in future studies.}, }
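[Editor's note] A schematic trial-by-trial loop in the spirit of the 3-parameter model described above, with one gain per subsystem (auditory feedback, somatosensory feedback, feedforward learning). The equations and parameter values are illustrative assumptions, not the published SimpleDIVA implementation:

import numpy as np

def simple_diva(perturb, alpha_A=0.3, alpha_S=0.1, lam_FF=0.5, n_trials=60):
    """Schematic adaptation loop: ff tracks the feedforward command's deviation
    (Hz) from the baseline formant target across trials."""
    ff = 0.0
    produced = []
    for t in range(n_trials):
        aud_err = ff + perturb[t]                   # heard output minus target (shifted feedback)
        som_err = ff                                # felt output minus target (unperturbed)
        correction = -(alpha_A * aud_err + alpha_S * som_err)
        produced.append(ff + correction)            # within-trial feedback correction
        ff += lam_FF * correction                   # feedforward update for the next trial
    return np.array(produced)

# 100 Hz upward F1 shift held constant across trials (hypothetical paradigm);
# the output settles at a negative value, i.e. partial compensation against the shift
print(simple_diva(np.full(60, 100.0))[-1])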
@article {pmid32037936, year = {2021}, author = {Binos, P and Thodi, C and Vogazianos, P and Psillas, G and Constantinidis, J}, title = {An acoustic and auditory analysis of vocants in infants with cochlear implants.}, journal = {Logopedics, phoniatrics, vocology}, volume = {46}, number = {1}, pages = {28-34}, doi = {10.1080/14015439.2020.1724325}, pmid = {32037936}, issn = {1651-2022}, mesh = {Acoustics ; Child ; *Cochlear Implantation ; *Cochlear Implants ; *Deafness/surgery ; Humans ; Infant ; Infant, Newborn ; Longitudinal Studies ; Speech Intelligibility ; *Speech Perception ; Voice Quality ; }, abstract = {INTRODUCTION: The duration of the nuclei is a crucial factor in the shift from prelexical to mature speech, since control of duration is closely related to improved speech intelligibility.
OBJECTIVES: This work records the suprasegmental feature of duration in infants with normal hearing (NH) compared to those with cochlear implants (CI) based on vocant productions (quasivowels and full vowels).
MATERIALS AND METHODS: In this longitudinal study, 102 vocant productions were analyzed from cases of congenitally hearing-impaired infants (implantation ages 1:4 and 1:11 years; post-implant ages 0:6 months and 1:3 years) who were matched with three NH infants of similar hearing experience (ages 0:8-0:11 months). The present methodology analyzed vocants using a combination of acoustic and auditory analyses. Vegetative and reflexive sounds were excluded. Participants had unknown deafness etiology and no other disabilities. Duration was measured using wideband spectrographic analysis, from voice onset to the loss of the audible signal and the decrease in energy of the higher formants.
RESULTS: The results showed that the mean vocant duration of young CI users was longer than that of hearing-matched peers during the first six months after cochlear implantation.
CONCLUSIONS: This documented weakness in CI users' speech production is a challenge for future work on speech processing strategies. This is the first study to measure the production of vocants during the pre-linguistic stage in CI recipients.}, }
@article {pmid32036357, year = {2021}, author = {Viegas, F and Viegas, D and Serra Guimarães, G and Ritto, F and Simões-Zenari, M and Nemr, K}, title = {Acoustic Analysis of Voice and Speech in Men with Skeletal Class III Malocclusion: A Pilot Study.}, journal = {Folia phoniatrica et logopaedica : official organ of the International Association of Logopedics and Phoniatrics (IALP)}, volume = {73}, number = {2}, pages = {117-125}, doi = {10.1159/000505186}, pmid = {32036357}, issn = {1421-9972}, mesh = {Acoustics ; Humans ; Male ; *Malocclusion ; Phonetics ; Pilot Projects ; *Speech ; Speech Acoustics ; }, abstract = {OBJECTIVES: To assess the fundamental frequency (f0) and the first three formant (F1, F2, F3) frequencies of the 7 oral vowels of Brazilian Portuguese in men with skeletal class III malocclusion and to compare these measures with a control group of individuals with Angle's class I.
METHODS: Sixty men aged 18-40 years, 20 with Angle's class III skeletal malocclusion and 40 with Angle's class I malocclusion, were selected by speech therapists and dentists. The speech signals were obtained from sustained vowels, and the values of f0 and the frequencies of F1, F2 and F3 were estimated. Differences were assessed with Student's t test, and effect sizes were calculated.
RESULTS: In the class III group, higher f0 values were observed for all vowels, along with higher values of F1 for the vowels [a] and [ε] and of F2 for the vowels [a], [e] and [i], and lower F1 and F3 values for the vowel [u].
CONCLUSION: Higher f0 values were found for all vowels investigated in the class III group, suggesting a higher laryngeal position in the production of these sounds. The frequencies of the first 3 formants showed isolated differences, with higher values of F1 in the vowels [a] and [ε] and of F2 in [a], [e] and [i], and lower values of F1 and F3 in the vowel [u] in the experimental group. Thus, the fundamental frequency of the voice was the main parameter that differentiated the study group from the control group.}, }
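The group comparison in this study, a t test plus an effect-size calculation, is straightforward to reproduce. A short NumPy/SciPy sketch follows; the formant values are invented purely to show the computation.

```python
import numpy as np
from scipy import stats

def cohens_d(x, y):
    """Cohen's d with a pooled standard deviation."""
    nx, ny = len(x), len(y)
    pooled = np.sqrt(((nx - 1) * np.var(x, ddof=1) +
                      (ny - 1) * np.var(y, ddof=1)) / (nx + ny - 2))
    return (np.mean(x) - np.mean(y)) / pooled

# Hypothetical F1 values (Hz) for vowel [a] in class III vs. class I groups.
f1_class3 = np.array([780, 812, 795, 760, 830])
f1_class1 = np.array([720, 705, 742, 698, 715])
t, p = stats.ttest_ind(f1_class3, f1_class1)
print(f"t = {t:.2f}, p = {p:.4f}, d = {cohens_d(f1_class3, f1_class1):.2f}")
```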
@article {pmid32007016, year = {2020}, author = {Kelley, MC and Tucker, BV}, title = {A comparison of four vowel overlap measures.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {1}, pages = {137}, doi = {10.1121/10.0000494}, pmid = {32007016}, issn = {1520-8524}, abstract = {Multiple measures of vowel overlap have been proposed that use F1, F2, and duration to calculate the degree of overlap between vowel categories. The present study assesses four of these measures: the spectral overlap assessment metric [SOAM; Wassink (2006). J. Acoust. Soc. Am. 119(4), 2334-2350], the a posteriori probability (APP)-based metric [Morrison (2008). J. Acoust. Soc. Am. 123(1), 37-40], the vowel overlap analysis with convex hulls method [VOACH; Haynes and Taylor, (2014). J. Acoust. Soc. Am. 136(2), 883-891], and the Pillai score as first used for vowel overlap by Hay, Warren, and Drager [(2006). J. Phonetics 34(4), 458-484]. Summaries of the measures are presented, and theoretical critiques of them are performed, concluding that the APP-based metric and Pillai score are theoretically preferable to SOAM and VOACH. The measures are empirically assessed using accuracy and precision criteria with Monte Carlo simulations. The Pillai score demonstrates the best overall performance in these tests. The potential applications of vowel overlap measures to research scenarios are discussed, including comparisons of vowel productions between different social groups, as well as acoustic investigations into vowel formant trajectories.}, }
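Of the four measures compared above, the Pillai score is the most compact to implement: it is Pillai's trace from a one-way MANOVA of the acoustic variables on vowel category (near 0 = heavy overlap, near 1 = full separation for two groups). A self-contained NumPy sketch under that definition, with simulated (F1, F2) clouds; this is a generic implementation, not the authors' code.

```python
import numpy as np

def pillai_score(X, labels):
    """Pillai's trace for a one-way MANOVA; X is (n, p), labels length n."""
    grand = X.mean(axis=0)
    H = np.zeros((X.shape[1],) * 2)   # between-group (hypothesis) SSCP
    E = np.zeros_like(H)              # within-group (error) SSCP
    for g in np.unique(labels):
        Xg = X[labels == g]
        d = (Xg.mean(axis=0) - grand)[:, None]
        H += len(Xg) * d @ d.T
        E += (len(Xg) - 1) * np.cov(Xg, rowvar=False)
    return np.trace(H @ np.linalg.inv(H + E))

# Two overlapping vowel categories in (F1, F2) space, 50 tokens each.
rng = np.random.default_rng(0)
v1 = rng.normal([500, 1500], 80, size=(50, 2))
v2 = rng.normal([550, 1400], 80, size=(50, 2))
X = np.vstack([v1, v2])
labs = np.array([0] * 50 + [1] * 50)
print(round(pillai_score(X, labs), 3))
```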
@article {pmid32007015, year = {2020}, author = {Renwick, MEL and Stanley, JA}, title = {Modeling dynamic trajectories of front vowels in the American South.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {1}, pages = {579}, doi = {10.1121/10.0000549}, pmid = {32007015}, issn = {1520-8524}, abstract = {Regional variation in American English speech is often described in terms of shifts, indicating which vowel sounds are converging or diverging. In the U.S. South, the Southern vowel shift (SVS) and African American vowel shift (AAVS) affect not only vowels' relative positions but also their formant dynamics. Static characterizations of shifting, with a single pair of first and second formant values taken near vowels' midpoint, fail to capture this vowel-inherent spectral change, which can indicate dialect-specific diphthongization or monophthongization. Vowel-inherent spectral change is directly modeled to investigate how trajectories of front vowels /i eɪ ɪ ɛ/ differ across social groups in the 64-speaker Digital Archive of Southern Speech. Generalized additive mixed models are used to test the effects of two social factors, sex and ethnicity, on trajectory shape. All vowels studied show significant differences between men, women, African American and European American speakers. Results show strong overlap between the trajectories of /eɪ, ɛ/ particularly among European American women, consistent with the SVS, and greater vowel-inherent raising of /ɪ/ among African American speakers, indicating how that lax vowel is affected by the AAVS. Model predictions of duration additionally indicate that across groups, trajectories become more peripheral as vowel duration increases.}, }
@article {pmid32006995, year = {2020}, author = {Chung, H}, title = {Vowel acoustic characteristics of Southern American English variation in Louisiana.}, journal = {The Journal of the Acoustical Society of America}, volume = {147}, number = {1}, pages = {541}, doi = {10.1121/10.0000505}, pmid = {32006995}, issn = {1520-8524}, abstract = {This study examined acoustic characteristics of vowels produced by speakers from Louisiana, one of the states in the Southern English dialect region. First, how Louisiana vowels differ from or are similar to the reported patterns of Southern dialect were examined. Then, within-dialect differences across regions in Louisiana were examined. Thirty-four female adult monolingual speakers of American English from Louisiana, ranging in age from 18 to 23, produced English monosyllabic words containing 11 vowels /i, ɪ, e, ɛ, æ, ʌ, u, ʊ, o, ɔ, ɑ/. The first two formant frequencies at the midpoint of the vowel nucleus, direction, and amount of formant changes across three different time points (20, 50, and 80%), and vowel duration were compared to previously reported data on Southern vowels. Overall, Louisiana vowels showed patterns consistent with previously reported characteristics of Southern vowels that reflect ongoing changes in the Southern dialect (no evidence of acoustic reversal of tense-lax pairs, more specifically no peripheralization of front vowels). Some dialect-specific patterns were also observed (a relatively lesser degree of formant changes and slightly shorter vowel duration). These patterns were consistent across different regions within Louisiana.}, }
@article {pmid31956258, year = {2019}, author = {Maebayashi, H and Takiguchi, T and Takada, S}, title = {Study on the Language Formation Process of Very-Low-Birth-Weight Infants in Infancy Using a Formant Analysis.}, journal = {The Kobe journal of medical sciences}, volume = {65}, number = {2}, pages = {E59-E70}, pmid = {31956258}, issn = {1883-0498}, mesh = {Case-Control Studies ; Child, Preschool ; Female ; Humans ; Infant ; Infant, Newborn ; Infant, Very Low Birth Weight/*growth & development ; *Language Development ; Male ; *Speech Acoustics ; }, abstract = {Expressive language development depends on anatomical factors, such as motor control of the tongue and oral cavity needed for vocalization, as well as cognitive aspects for comprehension and speech. The purpose of this study was to examine the differences in expressive language development between normal-birth-weight (NBW) infants and very-low-birth-weight (VLBW) infants in infancy using a formant analysis. We also examined the presence of differences between infants with a normal development and those with a high risk of autism spectrum disorder who were expected to exist among VLBW infants. The participants were 10 NBW infants and 10 VLBW infants 12-15 months of age whose speech had been recorded at intervals of approximately once every 3 months. The recorded speech signal was analyzed using a formant analysis, and changes due to age were observed. One NBW and 3 VLBW infants failed to pass the screening tests (CBCL and M-CHAT) at 24 months of age. The formant frequencies (F1 and F2) of the three groups of infants (NBW, VLBW and CBCL·M-CHAT non-passing infants) were scatter-plotted by age. For the NBW and VLBW infants, the area of the plot increased with age, but there was no significant expansion of the plot area for the CBCL·M-CHAT non-passing infants. The results showed no significant differences in expressive language development between NBW infants at 24 months old and VLBW infants at the corrected age. However, different language developmental patterns were observed in CBCL·M-CHAT non-passing infants, regardless of birth weight, suggesting the importance of screening by acoustic analyses.}, }
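The "area of the plot" measure in the preceding abstract is one of several ways to quantify the spread of an F1-F2 scatter; a convex-hull area is a common operationalization. A SciPy sketch with synthetic formant data is below; the paper does not specify its exact area measure, so treat this as illustrative.

```python
import numpy as np
from scipy.spatial import ConvexHull

def f1f2_area(f1, f2):
    """Area of the convex hull around (F1, F2) points; in 2-D,
    ConvexHull.volume is the enclosed area (.area is the perimeter)."""
    return ConvexHull(np.column_stack([f1, f2])).volume

# Hypothetical utterance-level formants (Hz) at two ages.
rng = np.random.default_rng(1)
early = f1f2_area(rng.normal(700, 60, 30), rng.normal(1800, 150, 30))
later = f1f2_area(rng.normal(700, 120, 30), rng.normal(1800, 350, 30))
print(f"area at 12 mo: {early:.0f} Hz^2, at 24 mo: {later:.0f} Hz^2")
```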
@article {pmid31944876, year = {2020}, author = {Hosbach-Cannon, CJ and Lowell, SY and Colton, RH and Kelley, RT and Bao, X}, title = {Assessment of Tongue Position and Laryngeal Height in Two Professional Voice Populations.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {1}, pages = {109-124}, doi = {10.1044/2019_JSLHR-19-00164}, pmid = {31944876}, issn = {1558-9102}, mesh = {Adult ; Audiometry ; Female ; Humans ; Laryngeal Muscles/*diagnostic imaging/physiology ; Larynx/*diagnostic imaging/physiology ; Male ; Phonation/physiology ; Singing/*physiology ; Sound Spectrography ; Stroboscopy ; Tongue/*diagnostic imaging/physiology ; Ultrasonography/*methods ; Young Adult ; }, abstract = {Purpose To advance our current knowledge of singer physiology by using ultrasonography in combination with acoustic measures to compare physiological differences between musical theater (MT) and opera (OP) singers under controlled phonation conditions. Primary objectives addressed in this study were (a) to determine if differences in hyolaryngeal and vocal fold contact dynamics occur between two professional voice populations (MT and OP) during singing tasks and (b) to determine if differences occur between MT and OP singers in oral configuration and associated acoustic resonance during singing tasks. Method Twenty-one singers (10 MT and 11 OP) were included. All participants were currently enrolled in a music program. Experimental procedures consisted of sustained phonation on the vowels /i/ and /ɑ/ during both a low-pitch task and a high-pitch task. Measures of hyolaryngeal elevation, tongue height, and tongue advancement were assessed using ultrasonography. Vocal fold contact dynamics were measured using electroglottography. Simultaneous acoustic recordings were obtained during all ultrasonography procedures for analysis of the first two formant frequencies. Results Significant oral configuration differences, reflected by measures of tongue height and tongue advancement, were seen between groups. Measures of acoustic resonance also showed significant differences between groups during specific tasks. Both singer groups significantly raised their hyoid position when singing high-pitched vowels, but hyoid elevation was not statistically different between groups. Likewise, vocal fold contact dynamics did not significantly differentiate the two singer groups. Conclusions These findings suggest that, under controlled phonation conditions, MT singers alter their oral configuration and achieve differing resultant formants as compared with OP singers. Because singers are at a high risk of developing a voice disorder, understanding how these two groups of singers adjust their vocal tract configuration during their specific singing genre may help to identify risky vocal behavior and provide a basis for prevention of voice disorders.}, }
@article {pmid31940258, year = {2020}, author = {Souza, P and Gallun, F and Wright, R}, title = {Contributions to Speech-Cue Weighting in Older Adults With Impaired Hearing.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {63}, number = {1}, pages = {334-344}, pmid = {31940258}, issn = {1558-9102}, support = {R01 DC006014/DC/NIDCD NIH HHS/United States ; R01 DC015051/DC/NIDCD NIH HHS/United States ; }, mesh = {Acoustic Stimulation/methods ; Aged ; Aged, 80 and over ; Audiometry, Pure-Tone ; *Auditory Threshold ; *Cues ; Female ; Hearing Aids ; Hearing Loss, Bilateral/*psychology ; Hearing Loss, Sensorineural/*psychology ; Hearing Tests ; Humans ; Male ; Middle Aged ; Speech Discrimination Tests ; *Speech Perception ; }, abstract = {Purpose In a previous paper (Souza, Wright, Blackburn, Tatman, & Gallun, 2015), we explored the extent to which individuals with sensorineural hearing loss used different cues for speech identification when multiple cues were available. Specifically, some listeners placed the greatest weight on spectral cues (spectral shape and/or formant transition), whereas others relied on the temporal envelope. In the current study, we aimed to determine whether listeners who relied on temporal envelope did so because they were unable to discriminate the formant information at a level sufficient to use it for identification and the extent to which a brief discrimination test could predict cue weighting patterns. Method Participants were 30 older adults with bilateral sensorineural hearing loss. The first task was to label synthetic speech tokens based on the combined percept of temporal envelope rise time and formant transitions. An individual profile was derived from linear discriminant analysis of the identification responses. The second task was to discriminate differences in either temporal envelope rise time or formant transitions. The third task was to discriminate spectrotemporal modulation in a nonspeech stimulus. Results All listeners were able to discriminate temporal envelope rise time at levels sufficient for the identification task. There was wide variability in the ability to discriminate formant transitions, and that ability predicted approximately one third of the variance in the identification task. There was no relationship between performance in the identification task and either amount of hearing loss or ability to discriminate nonspeech spectrotemporal modulation. Conclusions The data suggest that listeners who rely to a greater extent on temporal cues lack the ability to discriminate fine-grained spectral information. The fact that the amount of hearing loss was not associated with the cue profile underscores the need to characterize individual abilities in a more nuanced way than can be captured by the pure-tone audiogram.}, }
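A cue-weighting profile of the kind derived in the preceding study can be obtained from linear discriminant analysis of identification responses: the relative magnitudes of the discriminant coefficients index each cue's weight. A schematic scikit-learn sketch with synthetic data follows; it mirrors the approach described, not the study's actual analysis code.

```python
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.default_rng(3)
n = 200
# Synthetic per-token cue values: envelope rise time, formant-transition slope.
cues = rng.normal(size=(n, 2))
# A listener whose labels track rise time far more than formant transitions.
labels = (0.9 * cues[:, 0] + 0.1 * cues[:, 1] + rng.normal(0, 0.3, n)) > 0

lda = LinearDiscriminantAnalysis().fit(cues, labels)
w = np.abs(lda.coef_[0]) / np.abs(lda.coef_[0]).sum()
print(f"relative weight: rise time {w[0]:.2f}, formant transition {w[1]:.2f}")
```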
@article {pmid31898261, year = {2020}, author = {Kamiloğlu, RG and Fischer, AH and Sauter, DA}, title = {Good vibrations: A review of vocal expressions of positive emotions.}, journal = {Psychonomic bulletin & review}, volume = {27}, number = {2}, pages = {237-265}, pmid = {31898261}, issn = {1531-5320}, support = {714977/ERC_/European Research Council/International ; }, mesh = {Emotions/*physiology ; Humans ; Nonverbal Communication/*physiology ; Speech/*physiology ; Voice/*physiology ; }, abstract = {Researchers examining nonverbal communication of emotions are becoming increasingly interested in differentiations between different positive emotional states like interest, relief, and pride. But despite the importance of the voice in communicating emotion in general and positive emotion in particular, there is to date no systematic review of what characterizes vocal expressions of different positive emotions. Furthermore, integration and synthesis of current findings are lacking. In this review, we comprehensively review studies (N = 108) investigating acoustic features relating to specific positive emotions in speech prosody and nonverbal vocalizations. We find that happy voices are generally loud with considerable variability in loudness, have high and variable pitch, and are high in the first two formant frequencies. When specific positive emotions are directly compared with each other, pitch mean, loudness mean, and speech rate differ across positive emotions, with patterns mapping onto clusters of emotions, so-called emotion families. For instance, pitch is higher for epistemological emotions (amusement, interest, relief), moderate for savouring emotions (contentment and pleasure), and lower for a prosocial emotion (admiration). Some, but not all, of the differences in acoustic patterns also map on to differences in arousal levels. We end by pointing to limitations in extant work and making concrete proposals for future research on positive emotions in the voice.}, }
@article {pmid31893680, year = {2019}, author = {Dubey, AK and Prasanna, SRM and Dandapat, S}, title = {Detection and assessment of hypernasality in repaired cleft palate speech using vocal tract and residual features.}, journal = {The Journal of the Acoustical Society of America}, volume = {146}, number = {6}, pages = {4211}, doi = {10.1121/1.5134433}, pmid = {31893680}, issn = {1520-8524}, mesh = {Child ; Cleft Palate/*surgery ; Female ; Humans ; Male ; Speech/*physiology ; Speech Acoustics ; Speech Production Measurement/methods ; Velopharyngeal Insufficiency/physiopathology/*surgery ; Voice/*physiology ; }, abstract = {The presence of hypernasality in repaired cleft palate (CP) speech is a consequence of velopharyngeal insufficiency. The coupling of the nasal tract with the oral tract adds nasal formant and antiformant pairs in the hypernasal speech spectrum. This addition alters the spectral and linear prediction (LP) residual characteristics of hypernasal speech relative to normal speech. In this work, the vocal tract constriction feature, peak to side-lobe ratio feature, and spectral moment features augmented by low-order cepstral coefficients are used to capture the spectral and residual deviations for hypernasality detection. The first feature captures the prominence of lower frequencies in speech due to the presence of nasal formants, the second feature captures the undesirable signal components in the residual signal due to the nasal antiformants, and the third feature captures the information about formants and antiformants in the spectrum along with the spectral envelope. The combination of the three features gives normal versus hypernasal speech detection accuracies of 87.76%, 91.13%, and 93.70% for /a/, /i/, and /u/ vowels, respectively, and hypernasality severity detection accuracies of 80.13% and 81.25% for /i/ and /u/ vowels, respectively. The speech data were collected from 30 normal control children and 30 children with repaired CP, between the ages of 7 and 12.}, }
@article {pmid31889645, year = {2021}, author = {Shiraishi, M and Mishima, K and Umeda, H}, title = {Development of an Acoustic Simulation Method during Phonation of the Japanese Vowel /a/ by the Boundary Element Method.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {35}, number = {4}, pages = {530-544}, doi = {10.1016/j.jvoice.2019.11.022}, pmid = {31889645}, issn = {1873-4588}, mesh = {*Acoustics ; Adult ; Female ; Humans ; Japan ; Male ; *Phonation ; Speech Acoustics ; Vocal Cords/diagnostic imaging ; }, abstract = {OBJECTIVES: The purpose of the present study was to establish the method for an acoustic simulation of a vocal tract created from CT data during phonation of the Japanese vowel /a/ and to verify the validity of the simulation.
MATERIAL AND METHODS: The subjects were 15 healthy adults (8 males, 7 females). The vocal tract model was created from CT data acquired during sustained phonation of the Japanese vowel /a/. After conversion to a mesh model for analysis, a wave acoustic analysis was performed with a boundary element method. The wall and the bottom of the vocal tract model were regarded as a rigid wall and a nonrigid wall, respectively. The acoustic medium was set to 37°C, and a point sound source was set at the position corresponding to the vocal cords. The first and second formant frequencies (F1 and F2) were calculated. For 1 of the 15 subjects, the range from the upper end of the frontal sinus to the tracheal bifurcation was scanned, and 2 models were created: model 1 included the range from the frontal sinus to the tracheal bifurcation; and model 2 included the range from the frontal sinus to the glottis and added a cylindrical virtual tracheal extension of 12 cm. F1 and F2 calculated from models 1 and 2 were compared. To evaluate the validity of the present simulation, F1 and F2 calculated from the simulation were compared with those of the actual voice and the sound generated using a solid model and a whistle-type artificial larynx. To judge the validity, the vowel formant frequency discrimination threshold reported in the past was used as a criterion. Namely, the relative discrimination thresholds (%), dividing ΔF by F, where F was the formant frequency calculated from the simulation, and ΔF was the difference between F and the formant frequency of the actual voice and the sound generated using the solid model and artificial larynx, were obtained.
RESULTS: F1 and F2 calculated from models 1 and 2 were similar. Therefore, to reduce the exposure dose, the remaining 14 subjects were scanned from the upper end of the frontal sinus to the glottis, and model 2 with the trachea extended by 12 cm virtually was used for the simulation. The averages of the relative discrimination thresholds against F1 and F2 calculated from the actual voice were 5.9% and 4.6%, respectively. The averages of the relative discrimination thresholds against F1 and F2 calculated from the sound generated by using the solid model and the artificial larynx were 4.1% and 3.7%, respectively.
CONCLUSIONS: The Japanese vowel /a/ could be simulated with high validity for the vocal tract models created from the CT data during phonation of /a/ using the boundary element method.}, }
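The validity criterion in this abstract reduces to a relative difference between simulated and reference formants, 100 * |ΔF| / F. For concreteness, a tiny sketch of that computation with hypothetical F1/F2 pairs:

```python
def relative_threshold(f_sim, f_ref):
    """Relative difference (%) between a simulated formant and a reference:
    100 * |F_sim - F_ref| / F_sim, as described in the abstract."""
    return 100 * abs(f_sim - f_ref) / f_sim

# Hypothetical (simulation, actual voice) pairs in Hz for F1 and F2.
for f_sim, f_ref in [(760, 795), (1320, 1270)]:
    print(f"{relative_threshold(f_sim, f_ref):.1f}%")
```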
@article {pmid31889288, year = {2020}, author = {Huang, MY and Duan, RY and Zhao, Q}, title = {The influence of long-term cadmium exposure on the male advertisement call of Xenopus laevis.}, journal = {Environmental science and pollution research international}, volume = {27}, number = {8}, pages = {7996-8002}, pmid = {31889288}, issn = {1614-7499}, mesh = {Advertising ; Animals ; *Cadmium/chemistry ; *Endocrine Disruptors ; Female ; Male ; Xenopus laevis ; }, abstract = {Cadmium (Cd) is a non-essential environmental endocrine-disrupting compound found in water and a potential threat to aquatic habitats. Cd has been shown to have various short-term effects on aquatic animals; however, evidence for long-term effects of Cd on vocal communications in amphibians is lacking. To better understand the long-term effects of low-dose Cd on acoustic communication in amphibians, male Xenopus laevis individuals were treated with low Cd concentrations (0.1, 1, and 10 μg/L) via aqueous exposure for 24 months. At the end of the exposure, the acoustic spectrum characteristics of male advertisement calls and male movement behaviors in response to female calls were recorded. The gene and protein expressions of the androgen receptor (AR) were determined using Western blot and RT-PCR. The results showed that long-term Cd treatment affected the spectrogram and formant of the advertisement call. Compared with the control group, 10 μg/L Cd significantly decreased the first and second formant frequencies and the fundamental and main frequencies, and increased the third formant frequency. The 1- and 10-μg/L Cd treatments significantly reduced the proportion of individuals responding to female calls and prolonged the time to the male's first movement. Long-term Cd treatment induced a downregulation in the AR protein. Treatments of 0.1, 1, and 10 μg/L Cd significantly decreased the expression of AR mRNA in the brain. These findings indicate that long-term exposure to Cd has negative effects on advertisement calls in male X. laevis.}, }
@article {pmid31868693, year = {2019}, author = {Park, EJ and Yoo, SD and Kim, HS and Lee, JH and Yun, DH and Kim, DH and Chon, JM and Lee, SA and Soh, YS and Kim, Y and Han, YR and Yoo, MC and Choi, KM and Seo, YK and Lee, DH and Choi, YH and Jeong, KH and Son, JE}, title = {Correlations between swallowing function and acoustic vowel space in stroke patients with dysarthria.}, journal = {NeuroRehabilitation}, volume = {45}, number = {4}, pages = {463-469}, doi = {10.3233/NRE-192904}, pmid = {31868693}, issn = {1878-6448}, mesh = {*Deglutition ; Deglutition Disorders/epidemiology/*physiopathology ; Dysarthria/epidemiology/*physiopathology ; Female ; Humans ; Male ; Middle Aged ; *Speech Acoustics ; Stroke/*complications/physiopathology ; }, abstract = {BACKGROUND: Dysphagia and dysarthria tend to coexist in stroke patients. Dysphagia can reduce patients' quality of life, cause aspiration pneumonia and increased mortality.
OBJECTIVE: To evaluate correlations among swallowing function parameters and acoustic vowel space values in patients with stroke.
METHODS: Data from stroke patients with dysarthria and dysphagia were collected. The formant parameter representing the resonance frequency of the vocal tract as a two-dimensional coordinate point was measured for the /a/, /ae/, /i/, and /u/ vowels, and the quadrilateral vowel space area (VSA) and formant centralization ratio (FCR) were measured. Swallowing function was evaluated by a videofluoroscopic swallowing study (VFSS) using the videofluoroscopic dysphagia scale (VDS) and penetration aspiration scale (PAS). Pearson's correlation and linear regression analyses were used to assess the correlations of VSA and FCR with VDS and PAS scores.
RESULTS: Thirty-one stroke patients with dysphagia and dysarthria were analyzed. VSA showed a negative correlation to VDS and PAS scores, while FCR showed a positive correlation to VDS score, but not to PAS score. VSA and FCR were significant factors for assessing dysphagia severity.
CONCLUSIONS: VSA and FCR values were correlated with swallowing function and may be helpful in predicting dysphagia severity associated with stroke.}, }
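VSA and FCR are standard derived measures: the quadrilateral VSA is the area of the polygon spanned by the corner vowels in F1-F2 space, and the FCR is commonly defined (following Sapir and colleagues) as (F2u + F2a + F1i + F1u) / (F2i + F1a). A sketch with hypothetical formant means follows; that the paper used exactly this FCR formula is an assumption.

```python
import numpy as np

def shoelace_area(pts):
    """Polygon area via the shoelace formula; pts ordered around the hull."""
    x, y = pts[:, 0], pts[:, 1]
    return 0.5 * abs(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1)))

# Hypothetical mean (F1, F2) values in Hz for the four corner vowels.
f = {"a": (800, 1300), "ae": (700, 1900), "i": (300, 2300), "u": (320, 800)}

vsa = shoelace_area(np.array([f["i"], f["ae"], f["a"], f["u"]]))
fcr = (f["u"][1] + f["a"][1] + f["i"][0] + f["u"][0]) / (f["i"][1] + f["a"][0])
print(f"VSA = {vsa:.0f} Hz^2, FCR = {fcr:.2f}")
```

Centralized (dysarthric) vowels shrink the VSA and push the FCR upward, which is why the two measures correlate with severity in opposite directions.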
@article {pmid31862999, year = {2019}, author = {McCarthy, KM and Skoruppa, K and Iverson, P}, title = {Development of neural perceptual vowel spaces during the first year of life.}, journal = {Scientific reports}, volume = {9}, number = {1}, pages = {19592}, pmid = {31862999}, issn = {2045-2322}, mesh = {Electroencephalography ; Female ; Humans ; Infant ; Language ; Learning ; Male ; *Phonetics ; Sound Spectrography ; *Speech Acoustics ; Speech Discrimination Tests ; Speech Perception/*physiology ; Verbal Learning ; }, abstract = {This study measured infants' neural responses to spectral changes between all pairs of a set of English vowels. In contrast to previous methods that only allow for the assessment of a few phonetic contrasts, we present a new method that allows us to assess changes in spectral sensitivity across the entire vowel space and create two-dimensional perceptual maps of the infants' vowel development. Infants aged four to eleven months were played long series of concatenated vowels, and the neural response to each vowel change was assessed using the Acoustic Change Complex (ACC) from EEG recordings. The results demonstrated that the youngest infants' responses more closely reflected the acoustic differences between the vowel pairs and gave greater weight to first-formant variation. Older infants showed less acoustically driven responses that appeared to result from selective increases in sensitivity for phonetically similar vowels. The results suggest that phonetic development may involve a perceptual warping for confusable vowels rather than uniform learning, as well as an overall increasing sensitivity to higher-frequency acoustic information.}, }
@article {pmid31848063, year = {2021}, author = {Houle, N and Levi, SV}, title = {Effect of Phonation on Perception of Femininity/Masculinity in Transgender and Cisgender Speakers.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {35}, number = {3}, pages = {497.e23-497.e37}, doi = {10.1016/j.jvoice.2019.10.011}, pmid = {31848063}, issn = {1873-4588}, mesh = {Auditory Perception ; Female ; Femininity ; Humans ; Infant, Newborn ; Male ; Masculinity ; Phonation ; Speech Acoustics ; *Speech Perception ; *Transgender Persons ; }, abstract = {Many transwomen seek voice and communication therapy to support their transition from their gender assigned at birth to their gender identity. This has led to an increased need to examine the perception of gender and femininity/masculinity to develop evidence-based intervention practices. In this study, we explore the auditory perception of femininity/masculinity in normally phonated and whispered speech. Transwomen, ciswomen, and cismen were recorded producing /hVd/ words. Naïve listeners rated femininity/masculinity of a speaker's voice using a visual analog scale, rather than completing a binary gender identification task. The results revealed that listeners rated speakers more ambiguously in whispered speech than normally phonated speech. An analysis of speaker and token characteristics revealed that in the normally phonated condition listeners consistently use f0 to rate femininity/masculinity. In addition, some evidence was found for possible contributions of formant frequencies, particularly F2, and duration. Taken together, this provides additional evidence for the salience of f0 and F2 for voice and communication intervention among transwomen.}, }
@article {pmid31824364, year = {2019}, author = {Xu, Y and Prom-On, S}, title = {Economy of Effort or Maximum Rate of Information? Exploring Basic Principles of Articulatory Dynamics.}, journal = {Frontiers in psychology}, volume = {10}, number = {}, pages = {2469}, pmid = {31824364}, issn = {1664-1078}, support = {R01 DC003902/DC/NIDCD NIH HHS/United States ; }, abstract = {Economy of effort, a popular notion in contemporary speech research, predicts that dynamic extremes such as the maximum speed of articulatory movement are avoided as much as possible and that approaching the dynamic extremes is necessary only when there is a need to enhance linguistic contrast, as in the case of stress or clear speech. Empirical data, however, do not always support these predictions. In the present study, we considered an alternative principle: maximum rate of information, which assumes that speech dynamics are ultimately driven by the pressure to transmit information as quickly and accurately as possible. For empirical data, we asked speakers of American English to produce repetitive syllable sequences such as wawawawawa as fast as possible by imitating recordings of the same sequences that had been artificially accelerated and to produce meaningful sentences containing the same syllables at normal and fast speaking rates. Analysis of formant trajectories shows that dynamic extremes in meaningful speech sometimes even exceeded those in the nonsense syllable sequences but that this happened more often in unstressed syllables than in stressed syllables. We then used a target approximation model based on a mass-spring system of varying orders to simulate the formant kinematics. The results show that the kind of formant kinematics found in the present study and in previous studies can only be generated by a dynamical system operating with maximal muscular force under strong time pressure and that the dynamics of this operation may hold the solution to the long-standing enigma of greater stiffness in unstressed than in stressed syllables. We conclude, therefore, that maximum rate of information can coherently explain both current and previous empirical data and could therefore be a fundamental principle of motor control in speech production.}, }
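The target approximation model referred to in the preceding abstract drives an articulatory or acoustic variable toward a target with a critically damped dynamical system. Below is a generic third-order sketch in Python, with the polynomial coefficients fixed by the initial position, velocity, and acceleration; the parameter values are illustrative, not fitted to any of the study's data.

```python
import numpy as np

def target_approx(t, T, y0, v0, a0, lam):
    """Third-order critically damped target approximation:
    y(t) = T + (c0 + c1*t + c2*t^2) * exp(-lam*t),
    with c0, c1, c2 determined by the initial state (y0, v0, a0)."""
    c0 = y0 - T
    c1 = v0 + lam * c0
    c2 = (a0 + 2 * lam * c1 - lam**2 * c0) / 2
    return T + (c0 + c1 * t + c2 * t**2) * np.exp(-lam * t)

t = np.linspace(0, 0.2, 200)                       # one 200-ms syllable
f2 = target_approx(t, T=1800, y0=1200, v0=0, a0=0, lam=60)
print(f"F2 at 200 ms: {f2[-1]:.0f} Hz")            # approaches the 1800-Hz target
```

A larger lam (stiffness-like rate constant) produces faster target attainment, which is how the model family connects formant kinematics to time pressure on articulation.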
@article {pmid31795850, year = {2019}, author = {Root-Gutteridge, H and Ratcliffe, VF and Korzeniowska, AT and Reby, D}, title = {Dogs perceive and spontaneously normalize formant-related speaker and vowel differences in human speech sounds.}, journal = {Biology letters}, volume = {15}, number = {12}, pages = {20190555}, pmid = {31795850}, issn = {1744-957X}, support = {BB/P00170X/1/BB_/Biotechnology and Biological Sciences Research Council/United Kingdom ; }, mesh = {Animals ; Cues ; Dogs ; Humans ; Phonetics ; Speech ; *Speech Perception ; *Voice ; }, abstract = {Domesticated animals have been shown to recognize basic phonemic information from human speech sounds and to recognize familiar speakers from their voices. However, whether animals can spontaneously identify words across unfamiliar speakers (speaker normalization) or spontaneously discriminate between unfamiliar speakers across words remains to be investigated. Here, we assessed these abilities in domestic dogs using the habituation-dishabituation paradigm. We found that while dogs habituated to the presentation of a series of different short words from the same unfamiliar speaker, they significantly dishabituated to the presentation of a novel word from a new speaker of the same gender. This suggests that dogs spontaneously categorized the initial speaker across different words. Conversely, dogs who habituated to the same short word produced by different speakers of the same gender significantly dishabituated to a novel word, suggesting that they had spontaneously categorized the word across different speakers. Our results indicate that the ability to spontaneously recognize both the same phonemes across different speakers, and cues to identity across speech utterances from unfamiliar speakers, is present in domestic dogs and thus not a uniquely human trait.}, }
@article {pmid31795713, year = {2019}, author = {Vorperian, HK and Kent, RD and Lee, Y and Bolt, DM}, title = {Corner vowels in males and females ages 4 to 20 years: Fundamental and F1-F4 formant frequencies.}, journal = {The Journal of the Acoustical Society of America}, volume = {146}, number = {5}, pages = {3255}, pmid = {31795713}, issn = {1520-8524}, support = {P30 HD003352/HD/NICHD NIH HHS/United States ; R01 DC006282/DC/NIDCD NIH HHS/United States ; U54 HD090256/HD/NICHD NIH HHS/United States ; }, mesh = {Adolescent ; Aging/*physiology ; Child ; Child, Preschool ; Female ; Glottis/growth & development ; Humans ; Male ; *Phonation ; *Phonetics ; Sex Factors ; Speech Acoustics ; Voice ; Young Adult ; }, abstract = {The purpose of this study was to determine the developmental trajectory of the four corner vowels' fundamental frequency (fo) and the first four formant frequencies (F1-F4), and to assess when speaker-sex differences emerge. Five words per vowel, two of which were produced twice, were analyzed for fo and estimates of the first four formant frequencies from 190 (97 female, 93 male) typically developing speakers ages 4-20 years. Findings revealed developmental trajectories with decreasing values of fo and formant frequencies. Sex differences in fo emerged at age 7. The decrease of fo was larger in males than females with a marked drop during puberty. Sex differences in formant frequencies appeared at the earliest age under study and varied with vowel and formant. Generally, the higher formants (F3-F4) were sensitive to sex differences. Inter- and intra-speaker variability declined with age but had somewhat different patterns, likely reflective of maturing motor control that interacts with the changing anatomy. This study reports a source of developmental normative data on fo and the first four formants in both sexes. The different developmental patterns in the first four formants and vowel-formant interactions in sex differences likely point to anatomic factors, although speech-learning phenomena cannot be discounted.}, }
@article {pmid31795696, year = {2019}, author = {Gianakas, SP and Winn, MB}, title = {Lexical bias in word recognition by cochlear implant listeners.}, journal = {The Journal of the Acoustical Society of America}, volume = {146}, number = {5}, pages = {3373}, pmid = {31795696}, issn = {1520-8524}, support = {R03 DC014309/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Bias ; *Cochlear Implants ; Cues ; Female ; Hearing Loss/*physiopathology/rehabilitation ; Humans ; Male ; *Phonetics ; Speech Acoustics ; *Speech Perception ; }, abstract = {When hearing an ambiguous speech sound, listeners show a tendency to perceive it as a phoneme that would complete a real word, rather than completing a nonword. For example, a sound that could be heard as either /b/ or /ɡ/ is perceived as /b/ when followed by "_ack" but perceived as /ɡ/ when followed by "_ap". Because the target sound is acoustically identical across both environments, this effect demonstrates the influence of top-down lexical processing in speech perception. Degradations in the auditory signal were hypothesized to render speech stimuli more ambiguous, and therefore promote increased lexical bias. Stimuli included three speech continua that varied by spectral cues of varying speeds, including stop formant transitions (fast), fricative spectra (medium), and vowel formants (slow). Stimuli were presented to listeners with cochlear implants (CIs), and also to listeners with normal hearing with clear spectral quality, or with varying amounts of spectral degradation using a noise vocoder. Results indicated an increased lexical bias effect with degraded speech and for CI listeners, for whom the effect size was related to segment duration. This method can probe an individual's reliance on top-down processing even at the level of simple lexical/phonetic perception.}, }
@article {pmid31795676, year = {2019}, author = {Perrachione, TK and Furbeck, KT and Thurston, EJ}, title = {Acoustic and linguistic factors affecting perceptual dissimilarity judgments of voices.}, journal = {The Journal of the Acoustical Society of America}, volume = {146}, number = {5}, pages = {3384}, pmid = {31795676}, issn = {1520-8524}, mesh = {Adult ; Female ; Humans ; Male ; Phonetics ; Psycholinguistics ; *Speech Acoustics ; *Speech Perception ; *Voice ; }, abstract = {The human voice is a complex acoustic signal that conveys talker identity via individual differences in numerous features, including vocal source acoustics, vocal tract resonances, and dynamic articulations during speech. It remains poorly understood how differences in these features contribute to perceptual dissimilarity of voices and, moreover, whether linguistic differences between listeners and talkers interact during perceptual judgments of voices. Here, native English- and Mandarin-speaking listeners rated the perceptual dissimilarity of voices speaking English or Mandarin from either forward or time-reversed speech. The language spoken by talkers, but not listeners, principally influenced perceptual judgments of voices. Perceptual dissimilarity judgments of voices were always highly correlated between listener groups and forward/time-reversed speech. Representational similarity analyses that explored how acoustic features (fundamental frequency mean and variation, jitter, harmonics-to-noise ratio, speech rate, and formant dispersion) contributed to listeners' perceptual dissimilarity judgments, including how talker- and listener-language affected these relationships, found the largest effects relating to voice pitch. Overall, these data suggest that, while linguistic factors may influence perceptual judgments of voices, the magnitude of such effects tends to be very small. Perceptual judgments of voices by listeners of different native language backgrounds tend to be more alike than different.}, }
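Representational similarity analysis of the kind used in the preceding study correlates a perceptual dissimilarity matrix with distance matrices built from individual acoustic features. A compact sketch with synthetic stand-in data follows; the feature names match those listed in the abstract, but the values and the "perceived" ratings are random placeholders.

```python
import numpy as np
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr

rng = np.random.default_rng(2)
n = 20                                    # talkers
feats = rng.normal(size=(n, 3))           # z-scored acoustic features per talker
# Stand-in perceptual dissimilarities: feature distances plus listener noise.
perceived = pdist(feats) + rng.normal(0, 0.5, size=n * (n - 1) // 2)

# Correlate each single-feature distance matrix with perceptual dissimilarity.
for i, name in enumerate(["f0 mean", "f0 variation", "formant dispersion"]):
    rho, p = spearmanr(pdist(feats[:, [i]]), perceived)
    print(f"{name}: rho = {rho:.2f} (p = {p:.3f})")
```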
@article {pmid31789576, year = {2020}, author = {Lo, JJH}, title = {Between Äh(m) and Euh(m): The Distribution and Realization of Filled Pauses in the Speech of German-French Simultaneous Bilinguals.}, journal = {Language and speech}, volume = {63}, number = {4}, pages = {746-768}, doi = {10.1177/0023830919890068}, pmid = {31789576}, issn = {1756-6053}, mesh = {Adult ; Female ; France ; Germany ; Humans ; *Language ; Male ; *Multilingualism ; *Phonetics ; *Speech ; }, abstract = {Filled pauses are well known for their speaker specificity, yet cross-linguistic research has also shown language-specific trends in their distribution and phonetic quality. To examine the extent to which speakers acquire filled pauses as language- or speaker-specific phenomena, this study investigates the use of filled pauses in the context of adult simultaneous bilinguals. Making use of both distributional and acoustic data, this study analyzed UH, consisting of only a vowel component, and UM, with a vowel followed by [m], in the speech of 15 female speakers who were simultaneously bilingual in French and German. Speakers were found to use UM more frequently in German than in French, but only German-dominant speakers had a preference for UM in German. Formant and durational analyses showed that while speakers maintained distinct vowel qualities in their filled pauses in different languages, filled pauses in their weaker language exhibited a shift towards those in their dominant language. These results suggest that, despite high levels of variability between speakers, there is a significant role for language in the acquisition of filled pauses in simultaneous bilingual speakers, which is further shaped by the linguistic environment they grow up in.}, }
@article {pmid31777085, year = {2020}, author = {Frey, R and Volodin, IA and Volodina, EV and Efremova, KO and Menges, V and Portas, R and Melzheimer, J and Fritsch, G and Gerlach, C and von Dörnberg, K}, title = {Savannah roars: The vocal anatomy and the impressive rutting calls of male impala (Aepyceros melampus) - highlighting the acoustic correlates of a mobile larynx.}, journal = {Journal of anatomy}, volume = {236}, number = {3}, pages = {398-424}, pmid = {31777085}, issn = {1469-7580}, mesh = {Acoustics ; Animals ; Antelopes/*anatomy & histology/physiology ; Laryngeal Muscles/*anatomy & histology/physiology ; Larynx/*anatomy & histology/physiology ; Male ; Vocal Cords/anatomy & histology/physiology ; Vocalization, Animal/*physiology ; }, abstract = {A retractable larynx and adaptations of the vocal folds in the males of several polygynous ruminants serve for the production of rutting calls that acoustically announce larger than actual body size to both rival males and potential female mates. Here, such features of the vocal tract and of the sound source are documented in another species. We investigated the vocal anatomy and laryngeal mobility including its acoustical effects during the rutting vocal display of free-ranging male impala (Aepyceros melampus melampus) in Namibia. Male impala produced bouts of rutting calls (consisting of oral roars and interspersed explosive nasal snorts) in a low-stretch posture while guarding a rutting territory or harem. For the duration of the roars, male impala retracted the larynx from its high resting position to a low mid-neck position involving an extensible pharynx and a resilient connection between the hyoid apparatus and the larynx. Maximal larynx retraction was 108 mm based on estimates in video single frames. This was in good concordance with 91-mm vocal tract elongation calculated on the basis of differences in formant dispersion between roar portions produced with the larynx still ascended and those produced with maximally retracted larynx. Judged by their morphological traits, the larynx-retracting muscles of male impala are homologous to those of other larynx-retracting ruminants. In contrast, the large and massive vocal keels are evolutionary novelties arising by fusion and linear arrangement of the arytenoid cartilage and the canonical vocal fold. These bulky and histologically complex vocal keels produced a low fundamental frequency of 50 Hz. Impala is another ruminant species in which the males are capable of larynx retraction. In addition, male impala vocal folds are spectacularly specialized compared with domestic bovids, allowing the production of impressive, low-frequency roaring vocalizations as a significant part of their rutting behaviour. Our study expands knowledge on the evolutionary variation of vocal fold morphology in mammals, suggesting that the structure of the mammalian sound source is not always human-like and should be considered in acoustic analysis and modelling.}, }
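The vocal-tract elongation estimate in the preceding abstract rests on the uniform-tube relation between formant dispersion and vocal-tract length, VTL = c / (2 * Df), where Df is the mean spacing of adjacent formants. A sketch with hypothetical formant values, chosen so the implied elongation is of the magnitude reported:

```python
def vtl_from_formants(formants_hz, c=350.0):
    """Apparent vocal-tract length (m) from formant dispersion under a
    uniform-tube approximation: VTL = c / (2 * Df), Df = mean formant spacing."""
    df = (formants_hz[-1] - formants_hz[0]) / (len(formants_hz) - 1)
    return c / (2 * df)

# Hypothetical formants (Hz) before and during larynx retraction.
high_larynx = [600, 1550, 2500, 3450]   # Df = 950 Hz
retracted   = [420, 1060, 1700, 2340]   # Df = 640 Hz
for name, f in [("larynx high", high_larynx), ("larynx retracted", retracted)]:
    print(f"{name}: {1000 * vtl_from_formants(f):.0f} mm")
```

The difference between the two apparent lengths (roughly 90 mm with these illustrative numbers) is the quantity the authors recover from the roars' formant dispersion.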
@article {pmid31758279, year = {2020}, author = {Hu, G and Determan, SC and Dong, Y and Beeve, AT and Collins, JE and Gai, Y}, title = {Spectral and Temporal Envelope Cues for Human and Automatic Speech Recognition in Noise.}, journal = {Journal of the Association for Research in Otolaryngology : JARO}, volume = {21}, number = {1}, pages = {73-87}, pmid = {31758279}, issn = {1438-7573}, support = {T32 AR060719/AR/NIAMS NIH HHS/United States ; }, mesh = {Adult ; Artificial Intelligence ; Female ; Humans ; Male ; *Noise ; *Speech Acoustics ; *Speech Perception ; *Speech Recognition Software ; Young Adult ; }, abstract = {Acoustic features of speech include various spectral and temporal cues. It is known that temporal envelope plays a critical role for speech recognition by human listeners, while automated speech recognition (ASR) heavily relies on spectral analysis. This study compared sentence-recognition scores of humans and an ASR software, Dragon, when spectral and temporal-envelope cues were manipulated in background noise. Temporal fine structure of meaningful sentences was reduced by noise or tone vocoders. Three types of background noise were introduced: a white noise, a time-reversed multi-talker noise, and a fake-formant noise. Spectral information was manipulated by changing the number of frequency channels. With a 20-dB signal-to-noise ratio (SNR) and four vocoding channels, white noise had a stronger disruptive effect than the fake-formant noise. The same observation with 22 channels was made when SNR was lowered to 0 dB. In contrast, ASR was unable to function with four vocoding channels even with a 20-dB SNR. Its performance was least affected by white noise and most affected by the fake-formant noise. Increasing the number of channels, which improved the spectral resolution, generated non-monotonic behaviors for the ASR with white noise but not with colored noise. The ASR also showed highly improved performance with tone vocoders. It is possible that fake-formant noise affected the software's performance by disrupting spectral cues, whereas white noise affected performance by compromising speech segmentation. Overall, these results suggest that human listeners and ASR utilize different listening strategies in noise.}, }
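A noise vocoder of the kind used in the preceding study band-pass filters the speech, extracts each band's temporal envelope, and remodulates band-limited noise with it, so temporal fine structure is discarded while envelope cues survive. A minimal SciPy sketch (it assumes a sampling rate of at least 16 kHz; the channel count and band edges are illustrative, not the paper's exact settings):

```python
import numpy as np
from scipy.signal import butter, sosfiltfilt, hilbert

def noise_vocode(x, sr, n_channels=4, lo=100, hi=7000):
    """Envelope-modulated-noise vocoder: log-spaced bands, Hilbert envelopes,
    band-limited noise carriers, summed and peak-normalized."""
    edges = np.geomspace(lo, hi, n_channels + 1)
    rng = np.random.default_rng(0)
    out = np.zeros(len(x), dtype=float)
    for lo_f, hi_f in zip(edges[:-1], edges[1:]):
        sos = butter(4, [lo_f, hi_f], btype="bandpass", fs=sr, output="sos")
        band = sosfiltfilt(sos, x)
        env = np.abs(hilbert(band))                     # temporal envelope
        carrier = sosfiltfilt(sos, rng.standard_normal(len(x)))
        out += env * carrier
    return out / (np.max(np.abs(out)) + 1e-9)
```

Increasing n_channels restores spectral resolution while the fine structure stays noise-like, which is exactly the manipulation the study uses to separate spectral from temporal-envelope cues.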
@article {pmid31751443, year = {2019}, author = {França, FP and Almeida, AA and Lopes, LW}, title = {Acoustic-articulatory configuration of women with vocal nodules and with healthy voice.}, journal = {CoDAS}, volume = {31}, number = {6}, pages = {e20180241}, doi = {10.1590/2317-1782/20192018241}, pmid = {31751443}, issn = {2317-1782}, mesh = {Adult ; Aged ; Aged, 80 and over ; Case-Control Studies ; Cross-Sectional Studies ; Female ; Humans ; Laryngeal Diseases/physiopathology ; Middle Aged ; Phonetics ; *Speech Acoustics ; Speech Production Measurement/methods ; Vocal Cords/*physiopathology ; *Voice Quality ; }, abstract = {PURPOSE: To analyze the acoustic-articulatory configuration of vowels in women with vocal nodules and with healthy voice.
METHODS: Twelve women with vocal nodules (EG) and twelve vocally healthy women (CG) participated in this study. All women recorded carrier phrases with the vowels /a/, /i/, and /u/ in stressed position, preceded and followed by the occlusive consonant /p/: "Digo papa baixinho", "Digo pipa baixinho", and "Digo pupa baixinho". Subsequently, the first three formants (F1, F2, and F3) were extracted from these vowel targets.
RESULTS: Between the two groups studied, F1 measures differed for the vowels /a/ and /u/, and F2 measures differed for the vowel /a/. Women with vocal nodules showed lower values for these measures compared to vocally healthy women. Patients with vocal nodules also showed a smaller range of F1 and F2 values across the vowels /a/, /i/, and /u/ compared to vocally healthy women.
CONCLUSION: Women with vocal nodules show lower F1 and F2 values and lower range of motion of the articulators during vowel production compared to vocally healthy women.}, }
@article {pmid31747532, year = {2019}, author = {Hu, W and Tao, S and Li, M and Liu, C}, title = {Distinctiveness and Assimilation in Vowel Perception in a Second Language.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {62}, number = {12}, pages = {4534-4543}, doi = {10.1044/2019_JSLHR-H-19-0074}, pmid = {31747532}, issn = {1558-9102}, mesh = {*Acculturation ; Acoustic Stimulation ; Adolescent ; Adult ; Asian People/psychology ; Audiometry, Speech ; Female ; Humans ; *Language ; Male ; *Multilingualism ; *Phonetics ; Speech Perception/*physiology ; Young Adult ; }, abstract = {Purpose The purpose of this study was to investigate how the distinctive establishment of 2nd language (L2) vowel categories (e.g., how distinctively an L2 vowel is established from nearby L2 vowels and from the native language counterpart in the 1st formant [F1] × 2nd formant [F2] vowel space) affected L2 vowel perception. Method Identification of 12 natural English monophthongs, and categorization and rating of synthetic English vowels /i/ and /ɪ/ in the F1 × F2 space were measured for Chinese-native (CN) and English-native (EN) listeners. CN listeners were also examined with categorization and rating of Chinese vowels in the F1 × F2 space. Results As expected, EN listeners significantly outperformed CN listeners in English vowel identification. Whereas EN listeners showed distinctive establishment of 2 English vowels, CN listeners had multiple patterns of L2 vowel establishment: both, 1, or neither established. Moreover, CN listeners' English vowel perception was significantly related to the perceptual distance between the English vowel and its Chinese counterpart, and the perceptual distance between the adjacent English vowels. Conclusions L2 vowel perception relied on listeners' capacity to distinctively establish L2 vowel categories that were distant from the nearby L2 vowels.}, }
@article {pmid31741978, year = {2019}, author = {Sandeep, S and Shilpa, C and Shetty, TS and Basavaraj, S and Menon, NN}, title = {Voice Analysis in Post Tonsillectomy Patients.}, journal = {Indian journal of otolaryngology and head and neck surgery : official publication of the Association of Otolaryngologists of India}, volume = {71}, number = {Suppl 1}, pages = {312-317}, pmid = {31741978}, issn = {2231-3796}, abstract = {The main aim of this study was to analyse the change in voice in terms of acoustic parameters and its perceptual impact in patients who have undergone tonsillectomy. A prospective study was conducted at our institution (JSS Hospital and JSS Institute of Speech and Hearing, Mysore) over 1 year (December 2015-December 2016). Fifty post-tonsillectomy cases were selected randomly and subjected to acoustic analysis. Vocal analysis and assessment of the vowels 'a', 'i' and 'u' under the categories hoarse, harsh and breathy remained more or less the same across the preoperative stage and the first and second postoperative follow-ups. It was concluded that tonsillectomy did not appear to change the acoustic features of vowels remarkably, suggesting that subjects may adjust the shape of the vocal tract, using auditory feedback, to produce consistent speech sounds after surgery.}, }
@article {pmid31738857, year = {2019}, author = {Mollaei, F and Shiller, DM and Baum, SR and Gracco, VL}, title = {The Relationship Between Speech Perceptual Discrimination and Speech Production in Parkinson's Disease.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {62}, number = {12}, pages = {4256-4268}, doi = {10.1044/2019_JSLHR-S-18-0425}, pmid = {31738857}, issn = {1558-9102}, mesh = {Aged ; Basal Ganglia/physiopathology ; Case-Control Studies ; Feedback, Sensory ; Female ; Humans ; Male ; Middle Aged ; Parkinson Disease/*physiopathology ; Pitch Discrimination/*physiology ; Speech/*physiology ; Speech Acoustics ; Speech Discrimination Tests ; }, abstract = {Purpose We recently demonstrated that individuals with Parkinson's disease (PD) respond differentially to specific altered auditory feedback parameters during speech production. Participants with PD respond more robustly to pitch and less robustly to formant manipulations compared to control participants. In this study, we investigated whether differences in perceptual processing may in part underlie these compensatory differences in speech production. Methods Pitch and formant feedback manipulations were presented under 2 conditions: production and listening. In the production condition, 15 participants with PD and 15 age- and gender-matched healthy control participants judged whether their own speech output was manipulated in real time. During the listening task, participants judged whether paired tokens of their previously recorded speech samples were the same or different. Results Under listening, 1st formant manipulation discrimination was significantly reduced for the PD group compared to the control group. There was a trend toward better discrimination of pitch in the PD group, but the group difference was not significant. Under the production condition, the ability of participants with PD to identify pitch manipulations was greater than that of the controls. Conclusion The findings suggest perceptual processing differences associated with acoustic parameters of fundamental frequency and 1st formant perturbations in PD. These findings extend our previous results, indicating that different patterns of compensation to pitch and 1st formant shifts may reflect a combination of sensory and motor mechanisms that are differentially influenced by basal ganglia dysfunction.}, }
@article {pmid31734323, year = {2020}, author = {Escudero, P and Kalashnikova, M}, title = {Infants use phonetic detail in speech perception and word learning when detail is easy to perceive.}, journal = {Journal of experimental child psychology}, volume = {190}, number = {}, pages = {104714}, doi = {10.1016/j.jecp.2019.104714}, pmid = {31734323}, issn = {1096-0457}, mesh = {Analysis of Variance ; *Discrimination, Psychological ; Female ; Humans ; Infant ; Language ; *Language Development ; Male ; *Phonetics ; *Speech Perception ; *Verbal Learning ; }, abstract = {Infants successfully discriminate speech sound contrasts that belong to their native language's phonemic inventory in auditory-only paradigms, but they encounter difficulties in distinguishing the same contrasts in the context of word learning. These difficulties are usually attributed to the fact that infants' attention to the phonetic detail in novel words is attenuated when they must allocate additional cognitive resources demanded by word-learning tasks. The current study investigated 15-month-old infants' ability to distinguish novel words that differ by a single vowel in an auditory discrimination paradigm (Experiment 1) and a word-learning paradigm (Experiment 2). These experiments aimed to tease apart whether infants' performance is dependent solely on the specific acoustic properties of the target vowels or on the context of the task. Experiment 1 showed that infants were able to discriminate only a contrast marked by a large difference along a static dimension (the vowels' second formant), whereas they were not able to discriminate a contrast with a small phonetic distance between its vowels, due to the dynamic nature of the vowels. In Experiment 2, infants did not succeed at learning words containing the same contrast they were able to discriminate in Experiment 1. The current findings demonstrate that both the specific acoustic properties of vowels in infants' native language and the task presented continue to play a significant role in early speech perception well into the second year of life.}, }
@article {pmid31715197, year = {2020}, author = {Rosenthal, MA}, title = {A systematic review of the voice-tagging hypothesis of speech-in-noise perception.}, journal = {Neuropsychologia}, volume = {136}, number = {}, pages = {107256}, doi = {10.1016/j.neuropsychologia.2019.107256}, pmid = {31715197}, issn = {1873-3514}, mesh = {Discrimination, Psychological/*physiology ; Humans ; *Music ; Pitch Perception/*physiology ; Speech Perception/*physiology ; *Voice ; }, abstract = {The voice-tagging hypothesis claims that individuals who better represent pitch information in a speaker's voice, as measured with the frequency following response (FFR), will be better at speech-in-noise perception. The hypothesis has been provided to explain how music training might improve speech-in-noise perception. This paper reviews studies that are relevant to the voice-tagging hypothesis, including studies on musicians and nonmusicians. Most studies on musicians show greater f0 amplitude compared to controls. Most studies on nonmusicians do not show group differences in f0 amplitude. Across all studies reviewed, f0 amplitude does not consistently predict accuracy in speech-in-noise perception. The evidence suggests that music training does not improve speech-in-noise perception via enhanced subcortical representation of the f0.}, }
@article {pmid31708368, year = {2021}, author = {Hakanpää, T and Waaramaa, T and Laukkanen, AM}, title = {Comparing Contemporary Commercial and Classical Styles: Emotion Expression in Singing.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {35}, number = {4}, pages = {570-580}, doi = {10.1016/j.jvoice.2019.10.002}, pmid = {31708368}, issn = {1873-4588}, mesh = {Emotions ; Female ; Humans ; *Singing ; Speech Acoustics ; *Voice ; Voice Quality ; }, abstract = {OBJECTIVE: This study examines the acoustic correlates of the vocal expression of emotions in contemporary commercial music (CCM) and classical styles of singing. This information may be useful in improving the training of interpretation in singing.
STUDY DESIGN: This is an experimental comparative study.
METHODS: Eleven female singers with a minimum of 3 years of professional-level singing training in CCM, classical, or both styles participated. They sang the vowel [ɑ:] at three pitches (A3 220 Hz, E4 330 Hz, and A4 440 Hz) expressing anger, sadness, joy, tenderness, and a neutral voice. Vowel samples were analyzed for fundamental frequency (fo), formant frequencies (F1-F5), sound pressure level (SPL), spectral structure (alpha ratio = SPL in 1500-5000 Hz minus SPL in 50-1500 Hz), harmonics-to-noise ratio (HNR), perturbation (jitter, shimmer), onset and offset duration, sustain time, rate and extent of fo variation in vibrato, and rate and extent of amplitude vibrato.
RESULTS: The parameters that were statistically significantly (RM-ANOVA, P ≤ 0.05) related to emotion expression in both genres were SPL, alpha ratio, F1, and HNR. Additionally, for CCM, significance was found in sustain time, jitter, shimmer, F2, and F4. When fo and SPL were set as covariates in the variance analysis, jitter, HNR, and F4 did not show pure dependence on expression. The alpha ratio, F1, F2, shimmer apq5, amplitude vibrato rate, and sustain time of vocalizations had emotion-related variation also independent of fo and SPL in the CCM style, while these parameters were related to fo and SPL in the classical style.
CONCLUSIONS: The results differed somewhat for the CCM and classical styles. The alpha ratio showed less variation in the classical style, most likely reflecting the demand for a more stable voice source quality. The alpha ratio, F1, F2, shimmer, amplitude vibrato rate, and the sustain time of the vocalizations were related to fo and SPL control in the classical style. The only common independent sound parameter indicating emotional expression for both styles was SPL. The CCM style offers more freedom for expression-related changes in voice quality.}, }
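The alpha ratio used in the entry above is defined there as the level difference between the 1500-5000 Hz and 50-1500 Hz bands. A minimal Python sketch of that definition, assuming a Welch PSD as the level estimate (the function name and analysis parameters are illustrative, not the authors' implementation):

```python
import numpy as np
from scipy.signal import welch

def alpha_ratio(x, fs):
    """Alpha ratio (dB): level in 1500-5000 Hz minus level in 50-1500 Hz."""
    f, psd = welch(x, fs=fs, nperseg=2048)
    low = psd[(f >= 50) & (f < 1500)].sum()
    high = psd[(f >= 1500) & (f <= 5000)].sum()
    return 10 * np.log10(high / low)

# Example on synthetic noise; real use would pass a recorded vowel.
fs = 16000
x = np.random.randn(fs)  # 1 s of white noise
print(round(alpha_ratio(x, fs), 1))
```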
@article {pmid31693443, year = {2019}, author = {Weirich, M and Simpson, A}, title = {Effects of Gender, Parental Role, and Time on Infant- and Adult-Directed Read and Spontaneous Speech.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {62}, number = {11}, pages = {4001-4014}, doi = {10.1044/2019_JSLHR-S-19-0047}, pmid = {31693443}, issn = {1558-9102}, mesh = {Adult ; Age Factors ; *Child Language ; Germany ; Humans ; Infant ; Male ; Parents/*psychology ; Phonetics ; *Reading ; *Role ; Sex Factors ; *Speech ; }, abstract = {Purpose The study sets out to investigate inter- and intraspeaker variation in German infant-directed speech (IDS) and considers the potential impact that the factors gender, parental involvement, and speech material (read vs. spontaneous speech) may have. In addition, we analyze data from 3 time points prior to and after the birth of the child to examine potential changes in the features of IDS and, particularly also, of adult-directed speech (ADS). Here, the gender identity of a speaker is considered as an additional factor. Method IDS and ADS data from 34 participants (15 mothers, 19 fathers) is gathered by means of a reading and a picture description task. For IDS, 2 recordings were made when the baby was approximately 6 and 9 months old, respectively. For ADS, an additional recording was made before the baby was born. Phonetic analyses comprise mean fundamental frequency (f0), variation in f0, the 1st 2 formants measured in /i: ɛ a u:/, and the vowel space size. Moreover, social and behavioral data were gathered regarding parental involvement and gender identity. Results German IDS is characterized by an increase in mean f0, a larger variation in f0, vowel- and formant-specific differences, and a larger acoustic vowel space. No effect of gender or parental involvement was found. Also, the phonetic features of IDS were found in both spontaneous and read speech. Regarding ADS, changes in vowel space size in some of the fathers and in mean f0 in mothers were found. Conclusion Phonetic features of German IDS are robust with respect to the factors gender, parental involvement, speech material (read vs. spontaneous speech), and time. Some phonetic features of ADS changed within the child's first year depending on gender and parental involvement/gender identity. Thus, further research on IDS needs to address also potential changes in ADS.}, }
@article {pmid31688299, year = {2019}, author = {de Carvalho, CC and da Silva, DM and de Carvalho, AD and Nóbrega, FJF and de Orange, FA}, title = {Evaluation of the association between voice formants and difficult facemask ventilation.}, journal = {European journal of anaesthesiology}, volume = {36}, number = {12}, pages = {972-973}, doi = {10.1097/EJA.0000000000001108}, pmid = {31688299}, issn = {1365-2346}, mesh = {Adult ; Aged ; Airway Management/adverse effects/*instrumentation ; Anesthesia, General/adverse effects/instrumentation ; Elective Surgical Procedures/adverse effects ; Female ; Humans ; Intraoperative Complications/*epidemiology/etiology/physiopathology ; Larynx/anatomy & histology/physiology ; Male ; Masks/*adverse effects ; Middle Aged ; Phonation/*physiology ; Preoperative Period ; Prospective Studies ; Risk Assessment/methods ; Voice/*physiology ; }, }
@article {pmid31685325, year = {2021}, author = {Bernardi, JMB and de Barros, LN and Assunção, LS and de Oliveira, RS and Gambirásio, YF and Medved, DMS and Fernandes, ACN and da Silva, EM}, title = {Effect of the Finnish Tube on the Voice of a Deaf Musician: A Case Report.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {35}, number = {3}, pages = {498.e23-498.e29}, doi = {10.1016/j.jvoice.2019.09.019}, pmid = {31685325}, issn = {1873-4588}, mesh = {Brazil ; Finland ; Humans ; Male ; Middle Aged ; *Phonation ; Speech Acoustics ; Voice Quality ; *Voice Training ; }, abstract = {PURPOSE: To verify the auditory-perceptual and acoustic effects of the semioccluded vocal tract exercise with Finnish tube on the vocal quality of a deaf musician.
METHODS: A seven-day protocol with the Finnish tube was performed, with guidance for its home replication twice a day. A 46-year-old man with profound bilateral sensorineural hearing loss, a musician and composer, participated. Before and after the application of the protocol, he underwent tonal audiometry, nasofibrolaryngoscopy, acoustic analysis with Praat, and auditory-perceptual evaluation of the voice with the Voice Profile Analysis Scheme for Brazilian Portuguese.
RESULTS: The postintervention auditory-perceptual analysis identified reduced deviation in lip spreading, extensive labial range, raised tongue body, pharyngeal expansion, nasal resonance, larynx height, larynx and vocal tract tension and irregularity, pitch, and speech rate, as well as better respiratory support. The maximum phonation time decreased, probably because of the elimination of the abrupt vocal attack and tension, reduced articulatory deviations, improved voicing, and the absence of the use of expiratory reserve air. The fundamental frequency became lower, and the first, second, third, and fourth formants became higher. Jitter increased, and shimmer decreased.
CONCLUSION: The use of the Finnish tube might have facilitated the voicing sensations in the deaf musician, by enhancing the tactile-kinesthetic perception of the vocal tract and brought a greater source-filter interaction.}, }
@article {pmid31682569, year = {2020}, author = {Preisig, BC and Sjerps, MJ and Hervais-Adelman, A and Kösem, A and Hagoort, P and Riecke, L}, title = {Bilateral Gamma/Delta Transcranial Alternating Current Stimulation Affects Interhemispheric Speech Sound Integration.}, journal = {Journal of cognitive neuroscience}, volume = {32}, number = {7}, pages = {1242-1250}, doi = {10.1162/jocn_a_01498}, pmid = {31682569}, issn = {1530-8898}, mesh = {*Auditory Cortex ; Auditory Perception ; Humans ; Phonetics ; Speech ; *Transcranial Direct Current Stimulation ; }, abstract = {Perceiving speech requires the integration of different speech cues, that is, formants. When the speech signal is split so that different cues are presented to the right and left ear (dichotic listening), comprehension requires the integration of binaural information. Based on prior electrophysiological evidence, we hypothesized that the integration of dichotically presented speech cues is enabled by interhemispheric phase synchronization between primary and secondary auditory cortex in the gamma frequency band. We tested this hypothesis by applying transcranial alternating current stimulation (TACS) bilaterally above the superior temporal lobe to induce or disrupt interhemispheric gamma-phase coupling. In contrast to initial predictions, we found that gamma TACS applied in-phase above the two hemispheres (interhemispheric lag 0°) perturbs interhemispheric integration of speech cues, possibly because the applied stimulation perturbs an inherent phase lag between the left and right auditory cortex. We also observed this disruptive effect when applying antiphasic delta TACS (interhemispheric lag 180°). We conclude that interhemispheric phase coupling plays a functional role in interhemispheric speech integration. The direction of this effect may depend on the stimulation frequency.}, }
@article {pmid31663086, year = {2019}, author = {Howson, PJ and Redford, MA}, title = {Liquid coarticulation in child and adult speech.}, journal = {Proceedings of the ... International Congress of Phonetic Sciences. International Congress of Phonetic Sciences}, volume = {2019}, number = {}, pages = {3100-3104}, pmid = {31663086}, support = {R01 HD087452/HD/NICHD NIH HHS/United States ; }, abstract = {Although liquids are mastered late, English-speaking children are said to have fully acquired these segments by age 8. The aim of this study was to test whether liquid coarticulation was also adult-like by this age. Eight-year-old productions of /əLa/ and /əLu/ sequences were compared to 5-year-old and adult productions of these sequences. SSANOVA analyses of formant frequency trajectories indicated that, while adults contrasted rhotics and laterals from the onset of the vocalic sequence, F2 trajectories for rhotics and laterals were overlapped at the onset of the /əLa/ sequence in 8-year-old productions and across the entire /əLu/ sequence. The F2 trajectories for rhotics and laterals were even more overlapped in 5-year-olds' productions. Overall, the study suggests that whereas younger children have difficulty coordinating the tongue body/root gesture with the tongue tip gesture, older children still struggle with the intergestural timing associated with liquid production.}, }
@article {pmid31663083, year = {2019}, author = {Howson, PJ and Redford, MA}, title = {LISTENER PREFERENCE IS FOR REDUCED DETERMINERS THAT ANTICIPATE THE FOLLOWING NOUN.}, journal = {Proceedings of the ... International Congress of Phonetic Sciences. International Congress of Phonetic Sciences}, volume = {2019}, number = {}, pages = {378-382}, pmid = {31663083}, support = {R01 HD087452/HD/NICHD NIH HHS/United States ; }, abstract = {This study examines the effects of determiner reduction and coarticulation on the perceived naturalness of resynthesized shock-the-geek (V-the-N) sequences. The determiner, equally spaced between monosyllabic V and N, was manipulated in 3 experiments along a 7-step continuum: (1) duration varied from 0.25x the original duration to 4x this duration; (2) amplitude varied from 55 dB to 85 dB; (3) schwa formants varied from completely overlapped with the vowel in V to completely overlapped with the vowel in N. Listeners rated V-the-N sequences with reduced duration and intensity and more anticipatory coarticulation more favourably than sequences with increased duration and intensity and more preservatory coarticulation. These results are consistent with a listener preference for the production of supralexical chunks that adhere to morphosyntactic rather than metrical structure.}, }
@article {pmid31660423, year = {2019}, author = {Kim, D and Kim, S}, title = {Coarticulatory vowel nasalization in American English: Data of individual differences in acoustic realization of vowel nasalization as a function of prosodic prominence and boundary.}, journal = {Data in brief}, volume = {27}, number = {}, pages = {104593}, doi = {10.1016/j.dib.2019.104593}, pmid = {31660423}, issn = {2352-3409}, abstract = {This article provides acoustic measurements data for vowel nasalization which are based on speech recorded from fifteen (8 female and 7 male) native speakers of American English in a laboratory setting. Each individual speaker's production patterns for the vowel nasalization in tautosyllabic CVN and NVC words are documented in terms of three acoustic parameters: the duration of the nasal consonant (N-Duration), the duration of the vowel (V-Duration), and the difference between the amplitude of the first formant (A1) and the first nasal peak (P0) obtained from the vowel (A1-P0) as an indication of the degree of vowel nasalization. The A1-P0 is measured at three different time points within the vowel, i.e., the near point (25%), midpoint (50%), and distant point (75%), either from the onset (CVN) or the offset (NVC) of the nasal consonant. These measures are taken from the target words in various prosodic prominence and boundary contexts: phonologically focused (PhonFOC) vs. lexically focused (LexFOC) vs. unfocused (NoFOC) conditions; phrase-edge (i.e., phrase-final for CVN and phrase-initial for NVC) vs. phrase-medial conditions. The data also contain a CSV file with each speaker's mean values of the N-Duration, V-Duration, and A1-P0 (z-scored) for each prosodic context along with the information about the speakers' gender. For further discussion of the data, please refer to the full-length article entitled "Prosodically-conditioned fine-tuning of coarticulatory vowel nasalization in English" (Cho et al., 2017).}, }
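The A1-P0 measure in the entry above compares the amplitude of the harmonic near F1 with that of the first nasal peak. A rough illustrative sketch, assuming f0 and F1 are already known and using the first harmonic as a stand-in for P0 (the true P0 is the first nasal peak, which need not coincide with H1; all names here are illustrative):

```python
import numpy as np

def a1_p0(frame, fs, f0, f1):
    """Rough A1-P0 (dB): amplitude of the harmonic nearest F1 (A1) minus
    the amplitude of the first harmonic (used here as a proxy for P0)."""
    spec = np.abs(np.fft.rfft(frame * np.hanning(len(frame))))
    freqs = np.fft.rfftfreq(len(frame), 1.0 / fs)

    def harmonic_amp(target_hz):
        # strongest bin within +/- f0/2 of the harmonic closest to target_hz
        k = max(1, round(target_hz / f0))
        band = (freqs > (k - 0.5) * f0) & (freqs < (k + 0.5) * f0)
        return 20 * np.log10(spec[band].max())

    return harmonic_amp(f1) - harmonic_amp(f0)

# Example: synthetic vowel-like frame with f0 = 200 Hz, F1 near 600 Hz.
fs = 16000
t = np.arange(int(0.04 * fs)) / fs
frame = sum(np.sin(2 * np.pi * 200 * k * t) / k for k in range(1, 10))
print(round(a1_p0(frame, fs, f0=200.0, f1=600.0), 1))
```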
@article {pmid31659578, year = {2020}, author = {Goswami, U and Nirmala, SR and Vikram, CM and Kalita, S and Prasanna, SRM}, title = {Analysis of Articulation Errors in Dysarthric Speech.}, journal = {Journal of psycholinguistic research}, volume = {49}, number = {1}, pages = {163-174}, pmid = {31659578}, issn = {1573-6555}, mesh = {Dysarthria/*diagnosis/*physiopathology ; Female ; Humans ; Male ; Middle Aged ; Psycholinguistics ; *Speech Acoustics ; }, abstract = {Imprecise articulation is the major issue reported in various types of dysarthria. Detection of articulation errors can help in diagnosis. The cues derived from both the burst and the formant transitions contribute to the discrimination of place of articulation of stops. It is believed that any acoustic deviations in stops due to articulation error can be analyzed by deriving features around the burst and the voicing onsets. The derived features can be used to discriminate normal and dysarthric speech. In this work, a method is proposed to differentiate the voiceless stops produced by normal speakers from those produced by dysarthric speakers by deriving spectral moments, two-dimensional discrete cosine transform of the linear prediction spectrum, and Mel frequency cepstral coefficient features. These features and a cosine-distance-based classifier are used for the classification of normal and dysarthric speech.}, }
@article {pmid31658784, year = {2019}, author = {Machado, TJ and Vieira Filho, J and de Oliveira, MA}, title = {Forensic Speaker Verification Using Ordinary Least Squares.}, journal = {Sensors (Basel, Switzerland)}, volume = {19}, number = {20}, pages = {}, pmid = {31658784}, issn = {1424-8220}, abstract = {In Brazil, the recognition of speakers for forensic purposes still relies on a subjective decision-making process based on the analysis of results from unreliable techniques. Owing to the lack of a voice database, speaker verification is currently applied to samples specifically collected for confrontation. However, speaker comparative analysis via contested discourse requires the collection of an excessive amount of voice samples for a series of individuals. Further, the recognition system must indicate which of the pre-selected individuals is most compatible with the contested voice. Accordingly, this paper proposes using a combination of linear predictive coding (LPC) and ordinary least squares (OLS) as a speaker verification tool for forensic analysis. The proposed recognition technique establishes confidence and similarity upon which to base forensic reports, indicating verification of the speaker of the contested discourse. This paper therefore contributes an accurate, quick alternative method to help verify the speaker. After running seven different tests, this study preliminarily achieved a hit rate of 100% on a limited dataset (Brazilian Portuguese). Furthermore, the developed method extracts a larger number of formants, which are indispensable for statistical comparisons via OLS. The proposed framework is robust at certain levels of noise, for sentences with the suppression or change of words, and across different recording qualities or even meaningful differences in audio duration.}, }
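The entry above combines LPC-derived formants with an OLS comparison, but the abstract does not spell out the exact formulation. One plausible reading, sketched in Python with hypothetical function names (`lpc_formants`, `ols_similarity`): estimate candidate formants from LPC polynomial roots, then score similarity between two speakers' formant vectors with an OLS fit.

```python
import numpy as np
import librosa

def lpc_formants(frame, fs, order=12):
    """Candidate formant frequencies (Hz) from the roots of an LPC polynomial."""
    a = librosa.lpc(frame.astype(float), order=order)
    roots = np.roots(a)
    roots = roots[np.imag(roots) > 0]          # one root per conjugate pair
    return np.sort(np.angle(roots) * fs / (2 * np.pi))

def ols_similarity(f_contested, f_candidate):
    """Hypothetical similarity score: R^2 of an OLS fit of one speaker's
    formant vector on another's (vectors must have equal length)."""
    A = np.vstack([f_candidate, np.ones_like(f_candidate)]).T
    coef, *_ = np.linalg.lstsq(A, f_contested, rcond=None)
    resid = f_contested - A @ coef
    ss_tot = np.sum((f_contested - f_contested.mean()) ** 2)
    return 1.0 - np.sum(resid ** 2) / ss_tot
```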
@article {pmid31644889, year = {2020}, author = {Cartei, V and Banerjee, R and Garnham, A and Oakhill, J and Roberts, L and Anns, S and Bond, R and Reby, D}, title = {Physiological and perceptual correlates of masculinity in children's voices.}, journal = {Hormones and behavior}, volume = {117}, number = {}, pages = {104616}, doi = {10.1016/j.yhbeh.2019.104616}, pmid = {31644889}, issn = {1095-6867}, mesh = {Adolescent ; Adult ; Age Factors ; Auditory Perception/physiology ; Child ; Child Development/*physiology ; Child, Preschool ; Female ; Humans ; Male ; *Masculinity ; Sex Factors ; Sexual Maturation/physiology ; *Social Perception ; *Speech Acoustics ; Testosterone/blood ; Voice/*physiology ; Young Adult ; }, abstract = {Low frequency components (i.e. a low pitch (F0) and low formant spacing (ΔF)) signal high salivary testosterone and height in adult male voices and are associated with high masculinity attributions by unfamiliar listeners (in both men and women). However, the relation between the physiological, acoustic and perceptual dimensions of speakers' masculinity prior to puberty remains unknown. In this study, 110 pre-pubertal children (58 girls), aged 3 to 10, were recorded as they described a cartoon picture. A total of 315 adults (182 women) rated children's perceived masculinity from the voice only after listening to the speakers' audio recordings. On the basis of their voices alone, boys who had higher salivary testosterone levels were rated as more masculine, and the relation between testosterone and perceived masculinity was partially mediated by F0. The voices of taller boys were also rated as more masculine, but the relation between height and perceived masculinity was not mediated by the considered acoustic parameters, indicating that acoustic cues other than F0 and ΔF may signal stature. Both boys and girls who had lower F0 were also rated as more masculine, while ΔF did not affect ratings. These findings highlight the interdependence of physiological, acoustic and perceptual dimensions, and suggest that inter-individual variation in male voices, particularly F0, may advertise hormonal masculinity from a very early age.}, }
@article {pmid31621355, year = {2020}, author = {Scheerer, NE and Jacobson, DS and Jones, JA}, title = {Sensorimotor control of vocal production in early childhood.}, journal = {Journal of experimental psychology. General}, volume = {149}, number = {6}, pages = {1071-1077}, doi = {10.1037/xge0000706}, pmid = {31621355}, issn = {1939-2222}, mesh = {Acoustic Stimulation ; Child, Preschool ; Feedback, Sensory/physiology ; Female ; Humans ; Male ; Speech/*physiology ; Speech Perception/physiology ; Voice/*physiology ; }, abstract = {Children maintain fluent speech despite dramatic changes to their articulators during development. Auditory feedback aids in the acquisition and maintenance of the sensorimotor mechanisms that underlie vocal motor control. MacDonald, Johnson, Forsythe, Plante, and Munhall (2012) reported that toddlers' speech motor control systems may "suppress" the influence of auditory feedback, since exposure to altered auditory feedback regarding their formant frequencies did not lead to modifications of their speech. This finding is not parsimonious with most theories of motor control. Here, we exposed toddlers to perturbations to the pitch of their auditory feedback as they vocalized. Toddlers compensated for the manipulations, producing significantly different responses to upward and downward perturbations. These data represent the first empirical demonstration that toddlers use auditory feedback for vocal motor control. Furthermore, our findings suggest toddlers are more sensitive to changes to the postural properties of their auditory feedback, such as fundamental frequency, relative to the phonemic properties, such as formant frequencies. (PsycInfo Database Record (c) 2020 APA, all rights reserved).}, }
@article {pmid31593943, year = {2020}, author = {Conklin, JT and Dmitrieva, O}, title = {Vowel-to-Vowel Coarticulation in Spanish Nonwords.}, journal = {Phonetica}, volume = {77}, number = {4}, pages = {294-319}, doi = {10.1159/000502890}, pmid = {31593943}, issn = {1423-0321}, mesh = {Adult ; Female ; Humans ; Language ; Linguistics ; Male ; Middle Aged ; *Phonetics ; Spain ; *Speech Acoustics ; Young Adult ; }, abstract = {The present study examined vowel-to-vowel (VV) coarticulation in backness affecting mid vowels /e/ and /o/ in 36 Spanish nonwords produced by 20 native speakers of Spanish, aged 19-50 years (mean = 30.7; SD = 8.2). Examination of second formant frequency showed substantial carryover coarticulation throughout the data set, while anticipatory coarticulation was minimal and of shorter duration. Furthermore, the effect of stress on vowel-to-vowel coarticulation was investigated and found to vary by direction. In the anticipatory direction, small coarticulatory changes were relatively stable regardless of stress, particularly for target /e/, while in the carryover direction, a hierarchy of stress emerged wherein the greatest coarticulation occurred between stressed triggers and unstressed targets, less coarticulation was observed between unstressed triggers and unstressed targets, and the least coarticulation occurred between unstressed triggers with stressed targets. The results of the study augment and refine previously available knowledge about vowel-to-vowel coarticulation in Spanish and expand cross-linguistic understanding of the effect of stress on the magnitude and direction of vowel-to-vowel coarticulation.}, }
@article {pmid31590565, year = {2019}, author = {Lee, Y and Keating, P and Kreiman, J}, title = {Acoustic voice variation within and between speakers.}, journal = {The Journal of the Acoustical Society of America}, volume = {146}, number = {3}, pages = {1568}, pmid = {31590565}, issn = {1520-8524}, support = {R01 DC001797/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; *Biological Variation, Individual ; *Biological Variation, Population ; Female ; Humans ; Male ; Phonetics ; Psychoacoustics ; *Speech Acoustics ; Voice/*physiology ; }, abstract = {Little is known about the nature or extent of everyday variability in voice quality. This paper describes a series of principal component analyses to explore within- and between-talker acoustic variation and the extent to which they conform to expectations derived from current models of voice perception. Based on studies of faces and cognitive models of speaker recognition, the authors hypothesized that a few measures would be important across speakers, but that much of within-speaker variability would be idiosyncratic. Analyses used multiple sentence productions from 50 female and 50 male speakers of English, recorded over three days. Twenty-six acoustic variables from a psychoacoustic model of voice quality were measured every 5 ms on vowels and approximants. Across speakers the balance between higher harmonic amplitudes and inharmonic energy in the voice accounted for the most variance (females = 20%, males = 22%). Formant frequencies and their variability accounted for an additional 12% of variance across speakers. Remaining variance appeared largely idiosyncratic, suggesting that the speaker-specific voice space is different for different people. Results further showed that voice spaces for individuals and for the population of talkers have very similar acoustic structures. Implications for prototype models of voice perception and recognition are discussed.}, }
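The entry above runs principal component analyses over 26 acoustic variables measured every 5 ms. A minimal sketch of that style of analysis in Python, assuming a frames-by-variables matrix (random data stands in for the real measurements; the component count is illustrative):

```python
import numpy as np
from sklearn.decomposition import PCA

# X: one row per 5-ms analysis frame, one column per acoustic variable
# (26 in the paper); random data stands in for real measurements here.
rng = np.random.default_rng(0)
X = rng.normal(size=(10_000, 26))

Z = (X - X.mean(axis=0)) / X.std(axis=0)   # z-score each variable
pca = PCA(n_components=5).fit(Z)
print(pca.explained_variance_ratio_)       # variance captured per component
```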
@article {pmid31586643, year = {2019}, author = {Gammon, DE and Corsiglia, AM}, title = {Mockingbirds imitate frogs and toads across North America.}, journal = {Behavioural processes}, volume = {169}, number = {}, pages = {103982}, doi = {10.1016/j.beproc.2019.103982}, pmid = {31586643}, issn = {1872-8308}, mesh = {Acoustics ; Animals ; Anura ; Behavior, Animal/*physiology ; Bufonidae ; Imitative Behavior/*physiology ; North America ; Passeriformes/*physiology ; }, abstract = {Vocal mimicry is taxonomically widespread among birds, but little is known about mimicry of non-avian models. Prior studies show preferential imitation of avian models whose sounds are acoustically similar to the non-imitative songs of the vocal mimic. Based on these studies and anecdotes about frog imitations by northern mockingbirds (Mimus polyglottos), we hypothesized which anuran models would be most likely to get imitated by mockingbirds across their geographic range. We tested our hypothesis using >40 h of archived mockingbird recordings. Our results showed that mockingbirds imitated at least 12 anuran species, and calls were disproportionately mimicked when they contained dominant frequencies within the vocal range of the mockingbird (750-7000 Hz). Mockingbirds also frequently modified model anuran sounds by leaving out formants and/or truncating call duration. Our results represent the most comprehensive survey for any mimicking species of the imitation of anurans.}, }
@article {pmid31571334, year = {2020}, author = {Balaguer, M and Pommée, T and Farinas, J and Pinquier, J and Woisard, V and Speyer, R}, title = {Effects of oral and oropharyngeal cancer on speech intelligibility using acoustic analysis: Systematic review.}, journal = {Head & neck}, volume = {42}, number = {1}, pages = {111-130}, doi = {10.1002/hed.25949}, pmid = {31571334}, issn = {1097-0347}, mesh = {Acoustics ; Humans ; *Oropharyngeal Neoplasms ; Speech Acoustics ; *Speech Intelligibility ; Speech Production Measurement ; }, abstract = {BACKGROUND: The development of automatic tools based on acoustic analysis allows to overcome the limitations of perceptual assessment for patients with head and neck cancer. The aim of this study is to provide a systematic review of literature describing the effects of oral and oropharyngeal cancer on speech intelligibility using acoustic analysis.
METHODS: Two databases (PubMed and Embase) were surveyed. The selection process, according to the preferred reporting items for systematic reviews and meta-analyses (PRISMA) statement, led to a final set of 22 articles.
RESULTS: Nasalance is studied mainly in oropharyngeal patients. The vowels are mostly studied using formant analysis and vowel space area, and the consonants by means of spectral moments, with specific parameters according to their phonetic characteristics. Machine learning methods allow speech to be classified as "intelligible" or "unintelligible" for T3 or T4 tumors.
CONCLUSIONS: The development of comprehensive models combining different acoustic measures would allow a better consideration of the functional impact of the speech disorder.}, }
@article {pmid31564128, year = {2019}, author = {Suire, A and Raymond, M and Barkat-Defradas, M}, title = {Male Vocal Quality and Its Relation to Females' Preferences.}, journal = {Evolutionary psychology : an international journal of evolutionary approaches to psychology and behavior}, volume = {17}, number = {3}, pages = {1474704919874675}, doi = {10.1177/1474704919874675}, pmid = {31564128}, issn = {1474-7049}, mesh = {Adult ; Choice Behavior/*physiology ; Female ; Humans ; Male ; Sexual Behavior/*physiology ; *Social Perception ; *Verbal Behavior ; *Voice ; }, abstract = {In both correlational and experimental settings, studies on women's vocal preferences have reported negative relationships between perceived attractiveness and men's vocal pitch, emphasizing the idea of an adaptive preference. However, most of this work on vocal attractiveness has been conducted with native English speakers, and some evidence suggests that these preferences may be culture-dependent. Moreover, other overlooked acoustic components of vocal quality, such as intonation, perceived breathiness and roughness, may influence vocal attractiveness. In this context, the present study aims to contribute to the literature by investigating vocal attractiveness in an underrepresented language (i.e., French) as well as shedding light on its relationship with understudied acoustic components of vocal quality. More specifically, we investigated the relationships between attractiveness ratings as assessed by female raters and male voice pitch, its variation, the formants' dispersion and position, and the harmonics-to-noise and jitter ratios. Results show that women were significantly more attracted to lower vocal pitch and higher intonation patterns. However, they did not show directional preferences for any of the other acoustic features. We discuss our results in light of the adaptive functions of vocal preferences in a mate choice context.}, }
@article {pmid31543207, year = {2019}, author = {Zeng, Q and Jiao, Y and Huang, X and Wang, R and Bao, H and Lamb, JR and Le, J and Zhuang, P and Jiang, J}, title = {Effects of Angle of Epiglottis on Aerodynamic and Acoustic Parameters in Excised Canine Larynges.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {33}, number = {5}, pages = {627-633}, doi = {10.1016/j.jvoice.2018.02.007}, pmid = {31543207}, issn = {1873-4588}, mesh = {Acoustics ; Animals ; Dogs ; Epiglottis/*anatomy & histology/*physiology/surgery ; Laryngectomy ; *Phonation ; Pressure ; Sound Spectrography ; *Vocalization, Animal ; }, abstract = {OBJECTIVES: The aim of this study is to explore the effects of the angle of epiglottis (Aepi) on phonation and resonance in excised canine larynges.
METHODS: The anatomic Aepi was measured for 14 excised canine larynges as a control. Then, the Aepis were manually adjusted to 60° and 90° in each larynx. Aerodynamic and acoustic parameters, including mean flow rate, sound pressure level, jitter, shimmer, fundamental frequency (F0), and formants (F1'-F4'), were measured with a subglottal pressure of 1.5 kPa. Simple linear regression analysis between acoustic and aerodynamic parameters and the Aepi of the control was performed, and an analysis of variance comparing the acoustic and aerodynamic parameters of the three treatments was carried out.
RESULTS: The results of the study are as follows: (1) the larynges with larger anatomic Aepi had significantly lower jitter, shimmer, formant 1, and formant 2; (2) phonation threshold flow was significantly different for the three treatments; and (3) mean flow rate and sound pressure level were significantly different between the 60° and the 90° treatments of the 14 larynges.
CONCLUSIONS: The Aepi was proposed for the first time in this study. The Aepi plays an important role in phonation and resonance of excised canine larynges.}, }
@article {pmid31533114, year = {2020}, author = {Dmitrieva, O and Dutta, I}, title = {Acoustic Correlates of the Four-Way Laryngeal Contrast in Marathi.}, journal = {Phonetica}, volume = {77}, number = {3}, pages = {209-237}, doi = {10.1159/000501673}, pmid = {31533114}, issn = {1423-0321}, mesh = {Humans ; Speech Acoustics ; Speech ; *Larynx/diagnostic imaging ; Acoustics ; *Voice ; Language ; }, abstract = {The study examines acoustic correlates of the four-way laryngeal contrast in Marathi, focusing on temporal parameters, voice quality, and onset f0. Acoustic correlates of the laryngeal contrast were investigated in the speech of 33 native speakers of Marathi, recorded in Mumbai, India, producing a word list containing six sets of words minimally contrastive in terms of laryngeal specification of word-initial velar stops. Measurements were made for the duration of prevoicing, release, and voicing during release. Fundamental frequency was measured at the onset of voicing following the stop and at 10 additional time points. As measures of voice quality, amplitude differences between the first and second harmonic (H1-H2) and between the first harmonic and the third formant (H1-A3) were calculated. The results demonstrated that laryngeal categories in Marathi are differentiated based on temporal measures, voice quality, and onset f0, although differences in each dimension were unequal in magnitude across different pairs of stop categories. We conclude that a single acoustic correlate, such as voice onset time, is insufficient to differentiate among all the laryngeal categories in languages such as Marathi, characterized by complex four-way laryngeal contrasts. Instead, a joint contribution of several acoustic correlates creates a robust multidimensional contrast.}, }
@article {pmid31479380, year = {2019}, author = {Guan, J and Liu, C}, title = {Speech Perception in Noise With Formant Enhancement for Older Listeners.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {62}, number = {9}, pages = {3290-3301}, doi = {10.1044/2019_JSLHR-S-18-0089}, pmid = {31479380}, issn = {1558-9102}, mesh = {Age Factors ; Female ; Hearing Loss/*physiopathology ; Humans ; Male ; Middle Aged ; *Noise ; Speech Intelligibility ; *Speech Perception ; }, abstract = {Purpose Degraded speech intelligibility in background noise is a common complaint of listeners with hearing loss. The purpose of the current study is to explore whether 2nd formant (F2) enhancement improves speech perception in noise for older listeners with hearing impairment (HI) and normal hearing (NH). Method Target words (e.g., color and digit) were selected and presented based on the paradigm of the coordinate response measure corpus. Speech recognition thresholds with original and F2-enhanced speech in 2- and 6-talker babble were examined for older listeners with NH and HI. Results The thresholds for both the NH and HI groups improved for enhanced speech signals primarily in 2-talker babble, but not in 6-talker babble. The F2 enhancement benefits did not correlate significantly with listeners' age and their average hearing thresholds in most listening conditions. However, speech intelligibility index values increased significantly with F2 enhancement in babble for listeners with HI, but not for NH listeners. Conclusions Speech sounds with F2 enhancement may improve listeners' speech perception in 2-talker babble, possibly due to a greater amount of speech information available in temporally modulated noise or a better capacity to separate speech signals from background babble.}, }
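The entry above tests second-formant (F2) enhancement, but the abstract does not describe the enhancement algorithm itself. As a generic stand-in (not the authors' method), one way to emphasize energy near a known F2 is a peaking filter mixed with the original signal; all names and parameters below are illustrative:

```python
import numpy as np
from scipy.signal import iirpeak, lfilter

def enhance_f2(x, fs, f2_hz, q=5.0, mix=0.5):
    """Boost energy around an (assumed known) F2 with a peaking filter,
    then mix the boosted signal with the original."""
    b, a = iirpeak(f2_hz, Q=q, fs=fs)
    return (1 - mix) * x + mix * lfilter(b, a, x)

# Example: emphasize the 1.8 kHz component of a two-tone signal.
fs = 16000
t = np.arange(fs) / fs
x = np.sin(2 * np.pi * 500 * t) + 0.3 * np.sin(2 * np.pi * 1800 * t)
y = enhance_f2(x, fs, f2_hz=1800.0)
```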
@article {pmid31472573, year = {2019}, author = {Klein, E and Brunner, J and Hoole, P}, title = {The influence of coarticulatory and phonemic relations on individual compensatory formant production.}, journal = {The Journal of the Acoustical Society of America}, volume = {146}, number = {2}, pages = {1265}, doi = {10.1121/1.5122788}, pmid = {31472573}, issn = {1520-8524}, abstract = {Previous auditory perturbation studies have shown that speakers are able to simultaneously use multiple compensatory strategies to produce a certain acoustic target. In the case of formant perturbation, these findings were obtained examining the compensatory production for low vowels /ɛ/ and /æ/. This raises some controversy as more recent research suggests that the contribution of the somatosensory feedback to the production of vowels might differ across phonemes. In particular, the compensatory magnitude to auditory perturbations is expected to be weaker for high vowels compared to low vowels since the former are characterized by larger linguopalatal contact. To investigate this hypothesis, this paper conducted a bidirectional auditory perturbation study in which F2 of the high central vowel /ɨ/ was perturbed in opposing directions depending on the preceding consonant (alveolar vs velar). The consonants were chosen such that speakers' usual coarticulatory patterns were either compatible or incompatible with the required compensatory strategy. The results demonstrate that speakers were able to compensate for applied perturbations even if speakers' compensatory movements resulted in unusual coarticulatory configurations. However, the results also suggest that individual compensatory patterns were influenced by additional perceptual factors attributable to the phonemic space surrounding the target vowel /ɨ/.}, }
@article {pmid31472538, year = {2019}, author = {Migimatsu, K and Tokuda, IT}, title = {Experimental study on nonlinear source-filter interaction using synthetic vocal fold models.}, journal = {The Journal of the Acoustical Society of America}, volume = {146}, number = {2}, pages = {983}, doi = {10.1121/1.5120618}, pmid = {31472538}, issn = {1520-8524}, mesh = {Acoustics/*instrumentation ; Biomimetic Materials/chemistry ; *Models, Biological ; Phonation ; Silicones/chemistry ; Transducers ; Vocal Cords/*physiology ; Voice ; }, abstract = {Under certain conditions, e.g., singing voice, the fundamental frequency of the vocal folds can go up and interfere with the formant frequencies. Acoustic feedback from the vocal tract filter to the vocal fold source then becomes strong and non-negligible. An experimental study was presented on such source-filter interaction using three types of synthetic vocal fold models. Asymmetry was also created between the left and right vocal folds. The experiment reproduced various nonlinear phenomena, such as frequency jump and quenching, as reported in humans. Increase in phonation threshold pressure was also observed when resonant frequency of the vocal tract and fundamental frequency of the vocal folds crossed each other. As a combined effect, the phonation threshold pressure was further increased by the left-right asymmetry. Simulation of the asymmetric two-mass model reproduced the experiments to some extent. One of the intriguing findings of this study is the variable strength of the source-filter interaction over different model types. Among the three models, two models were strongly influenced by the vocal tract, while no clear effect of the vocal tract was observed in the other model. This implies that the level of source-filter interaction may vary considerably from one subject to another in humans.}, }
@article {pmid34307642, year = {2019}, author = {Mamun, N and Ghosh, R and Hansen, JHL}, title = {Quantifying Cochlear Implant Users' Ability for Speaker Identification using CI Auditory Stimuli.}, journal = {Interspeech}, volume = {2019}, number = {}, pages = {3118-3122}, pmid = {34307642}, issn = {2308-457X}, support = {R01 DC016839/DC/NIDCD NIH HHS/United States ; }, abstract = {Speaker recognition is a biometric modality that uses underlying speech information to determine the identity of the speaker. Speaker Identification (SID) under noisy conditions is one of the challenging topics in the field of speech processing, specifically when it comes to individuals with cochlear implants (CI). This study analyzes and quantifies the ability of CI users to perform speaker identification based on direct electric auditory stimuli. CI users employ a limited number of frequency bands (8 ∼ 22) and use electrodes to directly stimulate the basilar membrane/cochlea in order to recognize the speech signal. The sparsity of electric stimulation within the CI frequency range is a prime reason for loss in human speech recognition, as well as SID performance. Therefore, it is assumed that CI users might be unable to recognize and distinguish a speaker given speaker-dependent information such as formant frequencies and pitch, which is lost on unstimulated electrodes. To quantify this assumption, the input speech signal is processed using the CI Advanced Combination Encoder (ACE) signal processing strategy to construct the CI auditory electrodogram. The proposed study uses 50 speakers from each of three different databases, with the system trained using two different classifiers under quiet conditions and tested under both quiet and noisy conditions. The objective results show that CI users can effectively identify a limited number of speakers. However, their performance decreases when more speakers are added to the system, as well as when noisy conditions are introduced. This information could therefore be used to improve CI-user signal processing techniques and thereby human SID.}, }
@article {pmid31465711, year = {2019}, author = {Max, L and Daliri, A}, title = {Limited Pre-Speech Auditory Modulation in Individuals Who Stutter: Data and Hypotheses.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {62}, number = {8S}, pages = {3071-3084}, pmid = {31465711}, issn = {1558-9102}, support = {R01 DC007603/DC/NIDCD NIH HHS/United States ; R01 DC014510/DC/NIDCD NIH HHS/United States ; R01 DC017444/DC/NIDCD NIH HHS/United States ; MOP-137001//CIHR/Canada ; }, mesh = {Adult ; *Auditory Perception/physiology ; Evoked Potentials, Auditory/physiology ; Humans ; Models, Theoretical ; Speech/physiology ; Stuttering/etiology/*physiopathology ; }, abstract = {Purpose We review and interpret our recent series of studies investigating motor-to-auditory influences during speech movement planning in fluent speakers and speakers who stutter. In those studies, we recorded auditory evoked potentials in response to probe tones presented immediately prior to speaking or at the equivalent time in no-speaking control conditions. As a measure of pre-speech auditory modulation (PSAM), we calculated changes in auditory evoked potential amplitude in the speaking conditions relative to the no-speaking conditions. Whereas adults who do not stutter consistently showed PSAM, this phenomenon was greatly reduced or absent in adults who stutter. The same between-group difference was observed in conditions where participants expected to hear their prerecorded speech played back without actively producing it, suggesting that the speakers who stutter use inefficient forward modeling processes rather than inefficient motor command generation processes. Compared with fluent participants, adults who stutter showed both less PSAM and less auditory-motor adaptation when producing speech while exposed to formant-shifted auditory feedback. Across individual participants, however, PSAM and auditory-motor adaptation did not correlate in the typically fluent group, and they were negatively correlated in the stuttering group. Interestingly, speaking with a consistent 100-ms delay added to the auditory feedback signal normalized PSAM in speakers who stutter, and there was no longer a between-group difference in this condition. Conclusions Combining our own data with human and animal neurophysiological evidence from other laboratories, we interpret the overall findings as suggesting that (a) speech movement planning modulates auditory processing in a manner that may optimize its tuning characteristics for monitoring feedback during speech production and (b) in conditions with typical auditory feedback, adults who stutter do not appropriately modulate the auditory system prior to speech onset. Lack of modulation in speakers who stutter may lead to maladaptive feedback-driven movement corrections that manifest themselves as repetitive movements or postural fixations.}, }
@article {pmid31439969, year = {2018}, author = {Plummer, AR and Reidy, PF}, title = {Computing low-dimensional representations of speech from socio-auditory structures for phonetic analyses.}, journal = {Journal of phonetics}, volume = {71}, number = {}, pages = {355-375}, pmid = {31439969}, issn = {0095-4470}, support = {R01 DC002932/DC/NIDCD NIH HHS/United States ; }, abstract = {Low-dimensional representations of speech data, such as formant values extracted by linear predictive coding analysis or spectral moments computed from whole spectra viewed as probability distributions, have been instrumental in both phonetic and phonological analyses over the last few decades. In this paper, we present a framework for computing low-dimensional representations of speech data based on two assumptions: that speech data represented in high-dimensional data spaces lie on shapes called manifolds that can be used to map speech data to low-dimensional coordinate spaces, and that manifolds underlying speech data are generated from a combination of language-specific lexical, phonological, and phonetic information as well as culture-specific socio-indexical information that is expressed by talkers of a given speech community. We demonstrate the basic mechanics of the framework by carrying out an analysis of children's productions of sibilant fricatives relative to those of adults in their speech community using the phoneigen package - a publicly available implementation of the framework. We focus the demonstration on enumerating the steps for constructing manifolds from data and then using them to map the data to a low-dimensional space, explicating how manifold structure affects the learned low-dimensional representations, and comparing the use of these representations against standard acoustic features in a phonetic analysis. We conclude with a discussion of the framework's underlying assumptions, its broader modeling potential, and its position relative to recent advances in the field of representation learning.}, }
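The entry above maps high-dimensional speech representations to low-dimensional coordinates via manifolds; the abstract names the authors' phoneigen package but does not expose its API, so the sketch below uses scikit-learn's Isomap as a generic manifold-learning analogue (random data and all parameters are illustrative):

```python
import numpy as np
from sklearn.manifold import Isomap

# Rows: high-dimensional spectral representations of fricative tokens
# (random data stands in for the whole-spectra the framework ingests).
rng = np.random.default_rng(0)
X = np.abs(rng.normal(size=(500, 257)))

iso = Isomap(n_neighbors=10, n_components=2)
Y = iso.fit_transform(X)   # low-dimensional coordinates, one row per token
```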
@article {pmid31418715, year = {2019}, author = {Jain, S and Nataraja, NP}, title = {The Relationship between Temporal Integration and Temporal Envelope Perception in Noise by Males with Mild Sensorineural Hearing Loss.}, journal = {The journal of international advanced otology}, volume = {15}, number = {2}, pages = {257-262}, pmid = {31418715}, issn = {2148-3817}, mesh = {Acoustic Stimulation ; Adult ; Analysis of Variance ; Auditory Threshold/physiology ; Hearing Loss, Sensorineural/*physiopathology ; Humans ; Male ; *Noise ; Psychoacoustics ; Signal-To-Noise Ratio ; Speech Perception/*physiology ; }, abstract = {OBJECTIVES: A growing body of literature indicates that temporal integration and temporal envelope perception contribute substantially to the perception of speech. A review of the literature showed that speech perception based on temporal integration and temporal envelope perception in noise may be affected by sensorineural hearing loss, but to varying degrees. Because temporal integration and the temporal envelope share similar physiological processing at the cochlear level, the present study aimed to identify the relationship between temporal integration and temporal envelope perception in noise in individuals with mild sensorineural hearing loss.
MATERIALS AND METHODS: Thirty adult males with mild sensorineural hearing loss and thirty age- and gender-matched normal-hearing individuals volunteered to participate in the study. Temporal integration was measured using synthetic consonant-vowel-consonant syllables, varied for the onset, offset, and onset-offset of the second and third formant frequencies of the vowel following and preceding consonants in six equal steps, thus forming six-step onset, offset, and onset-offset continua. The duration of the transition was kept short (40 ms) in one set of continua and long (80 ms) in another. Temporal integration scores were calculated as the differences in the identification of the categorical boundary between short- and long-transition continua. Temporal envelope perception was measured using sentences processed in quiet and at 0 dB and -5 dB signal-to-noise ratios with 4, 8, 16, and 32 frequency channels; the temporal envelope of each sentence was extracted using the Hilbert transformation.
RESULTS: A significant effect of hearing loss was observed on temporal integration, but not on temporal envelope perception. However, when the temporal integration abilities were controlled, the variable effect of hearing loss on temporal envelope perception was noted.
CONCLUSION: It was important to measure the temporal integration to accurately account for the envelope perception by individuals with normal hearing and those with hearing loss.}, }
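The entry above extracts temporal envelopes via the Hilbert transformation. A minimal sketch of that step in Python, assuming the envelope is taken as the magnitude of the analytic signal and then low-pass smoothed (the cutoff and filter order are illustrative choices, not the authors'):

```python
import numpy as np
from scipy.signal import hilbert, butter, filtfilt

def temporal_envelope(x, fs, cutoff=50.0):
    """Temporal envelope: magnitude of the analytic signal (Hilbert
    transform), smoothed with a low-pass filter."""
    env = np.abs(hilbert(x))
    b, a = butter(4, cutoff, btype="low", fs=fs)
    return filtfilt(b, a, env)

# Example: recover the slow 4 Hz amplitude modulation of a 1 kHz tone.
fs = 16000
t = np.arange(fs) / fs
x = np.sin(2 * np.pi * 4 * t) * np.sin(2 * np.pi * 1000 * t)
env = temporal_envelope(x, fs)
```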
@article {pmid31417760, year = {2019}, author = {Cartei, V and Garnham, A and Oakhill, J and Banerjee, R and Roberts, L and Reby, D}, title = {Children can control the expression of masculinity and femininity through the voice.}, journal = {Royal Society open science}, volume = {6}, number = {7}, pages = {190656}, pmid = {31417760}, issn = {2054-5703}, abstract = {Pre-pubertal boys and girls speak with acoustically different voices despite the absence of a clear anatomical dimorphism in the vocal apparatus, suggesting that a strong component of the expression of gender through the voice is behavioural. Initial evidence for this hypothesis was found in a previous study showing that children can alter their voice to sound like a boy or like a girl. However, whether they can spontaneously modulate these voice components within their own gender in order to vary the expression of their masculinity and femininity remained to be investigated. Here, seventy-two English-speaking children aged 6-10 were asked to give voice to child characters varying in masculine and feminine stereotypicality to investigate whether primary school children spontaneously adjust the sex-related cues in their voice, fundamental frequency (F0) and formant spacing (ΔF), along gender-stereotypical lines. Boys and girls masculinized their voice, by lowering F0 and ΔF, when impersonating stereotypically masculine child characters of the same sex. Girls and older boys also feminized their voice, by raising their F0 and ΔF, when impersonating stereotypically feminine same-sex child characters. These findings reveal that children have some knowledge of the sexually dimorphic acoustic cues underlying the expression of gender, and are capable of controlling them to modulate gender-related attributes, paving the way for the use of the voice as an implicit, objective measure of the development of gender stereotypes and behaviour.}, }
@article {pmid31415186, year = {2019}, author = {Dorman, MF and Natale, SC and Zeitler, DM and Baxter, L and Noble, JH}, title = {Looking for Mickey Mouse™ But Finding a Munchkin: The Perceptual Effects of Frequency Upshifts for Single-Sided Deaf, Cochlear Implant Patients.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {62}, number = {9}, pages = {3493-3499}, pmid = {31415186}, issn = {1558-9102}, support = {R01 DC014037/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; *Auditory Perception ; *Cochlear Implants ; Deafness/*physiopathology/*rehabilitation ; Female ; Humans ; Middle Aged ; *Sound ; }, abstract = {Purpose Our aim was to make audible for normal-hearing listeners the Mickey Mouse™ sound quality of cochlear implants (CIs) often found following device activation. Method The listeners were 3 single-sided deaf patients fit with a CI who had 6 months or less of CI experience. Computed tomography imaging established the location of each electrode contact in the cochlea and allowed an estimate of the place frequency of the tissue nearest each electrode. For the most apical electrodes, this estimate ranged from 650 to 780 Hz. To determine CI sound quality, a clean signal (a sentence) was presented to the CI ear via a direct connect cable, and candidate CI-like signals were presented to the ear with normal hearing via an insert receiver. The listeners rated the similarity of the candidate signals to the sound of the CI on a 1- to 10-point scale, with 10 being a complete match. Results To make the match to CI sound quality, all 3 patients needed an upshift in formant frequencies (300-800 Hz) and a metallic sound quality. Two of the 3 patients also needed an upshift in voice pitch (10-80 Hz) and a muffling of sound quality. Similarity scores ranged from 8 to 9.7. Conclusion The formant frequency upshifts, fundamental frequency upshifts, and metallic sound quality experienced by the listeners can be linked to the relatively basal locations of the electrode contacts and short duration experience with their devices. The perceptual consequence was not the voice quality of Mickey Mouse™ but rather that of Munchkins in The Wizard of Oz, for whom both formant frequencies and voice pitch were upshifted. Supplemental Material https://doi.org/10.23641/asha.9341651.}, }
@article {pmid31399293, year = {2020}, author = {Knight, EJ and Austin, SF}, title = {The Effect of Head Flexion/Extension on Acoustic Measures of Singing Voice Quality.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {34}, number = {6}, pages = {964.e11-964.e21}, doi = {10.1016/j.jvoice.2019.06.019}, pmid = {31399293}, issn = {1873-4588}, mesh = {Acoustics ; Female ; Humans ; Phonation ; *Singing ; *Voice ; Voice Quality ; }, abstract = {A study was undertaken to identify the effect of head flexion/extension on singing voice quality. The amplitude of the fundamental frequency (F0) and the singing power ratio (SPR), an indirect measure of Singer's Formant activity, were measured. F0 and SPR scores at four experimental head positions were compared with the subjects' scores at their habitual positions. Three vowels and three pitch levels were tested. F0 amplitudes and low-frequency partials in general were greater with neck extension, while SPR increased with neck flexion. No effect of pitch or vowel was found. Gains in SPR appear to be the result of damping low-frequency partials rather than amplifying those in the Singer's Formant region. Raising the amplitude of F0 is an important resonance tool for female voices in the high range, and may be of benefit to other voice types in resonance, loudness, and laryngeal function.}, }
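The entry above uses the singing power ratio (SPR) as an indirect measure of Singer's Formant activity. The abstract does not spell out the computation; SPR is commonly defined as the level difference between the strongest spectral peaks in the 2-4 kHz and 0-2 kHz bands, and the sketch below follows that common definition (function name and PSD parameters are illustrative):

```python
import numpy as np
from scipy.signal import welch

def singing_power_ratio(x, fs):
    """SPR (dB): strongest spectral peak in 2-4 kHz relative to the
    strongest peak in 0-2 kHz (more negative = weaker singer's formant)."""
    f, psd = welch(x, fs=fs, nperseg=4096)
    low = psd[(f > 0) & (f < 2000)].max()
    high = psd[(f >= 2000) & (f <= 4000)].max()
    return 10 * np.log10(high / low)
```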
@article {pmid31379540, year = {2019}, author = {Alho, K and Żarnowiec, K and Gorina-Careta, N and Escera, C}, title = {Phonological Task Enhances the Frequency-Following Response to Deviant Task-Irrelevant Speech Sounds.}, journal = {Frontiers in human neuroscience}, volume = {13}, number = {}, pages = {245}, pmid = {31379540}, issn = {1662-5161}, abstract = {In electroencephalography (EEG) measurements, processing of periodic sounds in the ascending auditory pathway generates the frequency-following response (FFR) phase-locked to the fundamental frequency (F0) and its harmonics of a sound. We measured FFRs to the steady-state (vowel) part of syllables /ba/ and /aw/ occurring in binaural rapid streams of speech sounds as frequently repeating standard syllables or as infrequent (p = 0.2) deviant syllables among standard /wa/ syllables. Our aim was to study whether concurrent active phonological processing affects early processing of irrelevant speech sounds reflected by FFRs to these sounds. To this end, during syllable delivery, our healthy adult participants performed tasks involving written letters delivered on a computer screen in a rapid stream. The stream consisted of vowel letters written in red, infrequently occurring consonant letters written in the same color, and infrequently occurring vowel letters written in blue. In the phonological task, the participants were instructed to press a response key to the consonant letters differing phonologically but not in color from the frequently occurring red vowels, whereas in the non-phonological task, they were instructed to respond to the vowel letters written in blue differing only in color from the frequently occurring red vowels. We observed that the phonological task enhanced responses to deviant /ba/ syllables but not responses to deviant /aw/ syllables. This suggests that active phonological task performance may enhance processing of such small changes in irrelevant speech sounds as the 30-ms difference in the initial formant-transition time between the otherwise identical syllables /ba/ and /wa/ used in the present study.}, }
@article {pmid31370636, year = {2019}, author = {Birkholz, P and Gabriel, F and Kürbis, S and Echternach, M}, title = {How the peak glottal area affects linear predictive coding-based formant estimates of vowels.}, journal = {The Journal of the Acoustical Society of America}, volume = {146}, number = {1}, pages = {223}, doi = {10.1121/1.5116137}, pmid = {31370636}, issn = {1520-8524}, abstract = {The estimation of formant frequencies from acoustic speech signals is mostly based on Linear Predictive Coding (LPC) algorithms. Since LPC is based on the source-filter model of speech production, the formant frequencies obtained are often implicitly regarded as those for an infinite glottal impedance, i.e., a closed glottis. However, previous studies have indicated that LPC-based formant estimates of vowels generated with a realistically varying glottal area may substantially differ from the resonances of the vocal tract with a closed glottis. In the present study, the deviation between closed-glottis resonances and LPC-estimated formants during phonation with different peak glottal areas has been systematically examined both using physical vocal tract models excited with a self-oscillating rubber model of the vocal folds, and by computer simulations of interacting source and filter models. Ten vocal tract resonators representing different vowels have been analyzed. The results showed that F1 increased with the peak area of the time-varying glottis, while F2 and F3 were not systematically affected. The effect of the peak glottal area on F1 was strongest for close-mid to close vowels, and more moderate for mid to open vowels.}, }
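The LPC formant estimates whose glottal-area sensitivity this study examines are typically obtained by root-finding on an all-pole fit; a minimal sketch follows, assuming a quasi-stationary mono vowel frame `y` at sampling rate `sr` (bandwidth filtering and frame-to-frame tracking, which production tools add, are omitted):

```python
# Sketch of LPC-based formant estimation via polynomial root-finding.
import numpy as np
import librosa

def lpc_formants(y, sr, order=12):
    y = np.asarray(y, dtype=float)
    y = np.append(y[0], y[1:] - 0.97 * y[:-1])          # pre-emphasis
    y = y * np.hamming(len(y))                          # taper the analysis frame
    a = librosa.lpc(y, order=order)                     # all-pole model coefficients
    roots = [r for r in np.roots(a) if np.imag(r) > 0]  # keep one of each conjugate pair
    freqs = sorted(np.angle(roots) * sr / (2 * np.pi))  # pole angle -> frequency in Hz
    return [f for f in freqs if f > 90.0]               # drop near-DC poles

# e.g., lpc_formants(frame, sr=16000) -> approximately [F1, F2, F3, ...]
```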
@article {pmid31370618, year = {2019}, author = {González Hautamäki, R and Hautamäki, V and Kinnunen, T}, title = {On the limits of automatic speaker verification: Explaining degraded recognizer scores through acoustic changes resulting from voice disguise.}, journal = {The Journal of the Acoustical Society of America}, volume = {146}, number = {1}, pages = {693}, doi = {10.1121/1.5119240}, pmid = {31370618}, issn = {1520-8524}, abstract = {In speaker verification research, objective performance benchmarking of listeners and automatic speaker verification (ASV) systems is of key importance in understanding the limits of speaker recognition. While the adoption of common data and metrics has been instrumental to progress in ASV, there are two major shortcomings. First, the utterances lack intentional voice changes imposed by the speaker. Second, the standard evaluation metrics focus on average performance across all speakers and trials. As a result, a knowledge gap remains in how the acoustic changes impact recognition performance at the level of individual speakers. This paper addresses the limits of speaker recognition in ASV systems under voice disguise, using a linear mixed effects model to analyze the impact of changes in the long-term statistics of selected features (formants F1-F4, the bandwidths B1-B4, F0, and speaking rate) on the ASV log-likelihood ratio (LLR) score. The correlations between the proposed predictive model and the LLR scores are 0.72 for female and 0.81 for male speakers. As a whole, the difference in long-term F0 between enrollment and test utterances was found to be the individually most detrimental factor, even if the ASV system uses only spectral, rather than prosodic, features.}, }
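The modeling idea here, regressing score degradation on changes in long-term feature statistics with per-speaker grouping, can be sketched with a linear mixed effects model; the input file and all column names (llr_drop, delta_F0, delta_F1, ..., speaker) are hypothetical stand-ins, not the study's data:

```python
# Sketch: mixed effects model of ASV score degradation under voice disguise.
import pandas as pd
import statsmodels.formula.api as smf

df = pd.read_csv("disguise_trials.csv")  # hypothetical: one row per enrollment/test pair
model = smf.mixedlm(
    "llr_drop ~ delta_F0 + delta_F1 + delta_F2 + delta_F3 + delta_F4 + delta_rate",
    data=df,
    groups=df["speaker"],  # random intercept per speaker
)
print(model.fit().summary())  # fixed-effect estimate per acoustic change
```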
@article {pmid31370566, year = {2019}, author = {Patel, RR and Lulich, SM and Verdi, A}, title = {Vocal tract shape and acoustic adjustments of children during phonation into narrow flow-resistant tubes.}, journal = {The Journal of the Acoustical Society of America}, volume = {146}, number = {1}, pages = {352}, doi = {10.1121/1.5116681}, pmid = {31370566}, issn = {1520-8524}, mesh = {Child ; Female ; Glottis/physiology ; Humans ; Male ; Mouth/physiology ; Phonation/*physiology ; Tongue/physiology ; Voice/*physiology ; *Voice Quality ; *Voice Training ; }, abstract = {The goal of the study is to quantify the salient vocal tract acoustic, subglottal acoustic, and vocal tract physiological characteristics during phonation into a narrow flow-resistant tube with 2.53 mm inner diameter and 124 mm length in typically developing vocally healthy children using simultaneous microphone, accelerometer, and 3D/4D ultrasound recordings. Acoustic measurements included fundamental frequency (fo), first formant frequency (F1), second formant frequency (F2), first subglottal resonance (FSg1), and peak-to-peak amplitude ratio (Pvt:Psg). Physiological measurements included posterior tongue height (D1), tongue dorsum height (D2), tongue tip height (D3), tongue length (D4), oral cavity width (D5), hyoid elevation (D6), and pharynx width (D7). All measurements were made on eight boys and ten girls (6-9 years) during sustained /o:/ production at typical pitch and loudness, with and without the flow-resistant tube. Phonation with the flow-resistant tube resulted in a significant decrease in F1, F2, and Pvt:Psg and a significant increase in D2, D3, and FSg1. A statistically significant gender effect was observed for D1, with D1 higher in boys. These findings agree well with reported findings from adults, suggesting common acoustic and articulatory mechanisms for narrow flow-resistant tube phonation. Theoretical implications of the findings are discussed.}, }
@article {pmid31370496, year = {2019}, author = {Wadamori, N}, title = {Evaluation of a photoacoustic bone-conduction vibration system.}, journal = {The Review of scientific instruments}, volume = {90}, number = {7}, pages = {074905}, doi = {10.1063/1.5081078}, pmid = {31370496}, issn = {1089-7623}, abstract = {This article proposes a bone conduction vibrator based on a phenomenon by which audible sound can be perceived when vibrations are produced using a laser beam synchronized to the sound, and these vibrations are then transmitted to an auricular cartilage. To study this phenomenon, we measured the vibrations using a rubber sheet with similar properties to those of soft tissue in combination with an acceleration sensor. We also calculated the force level of the sound based on the mechanical impedance and the acceleration in the proposed system. We estimated the formant frequencies of specific vibrations that were synchronized to five Japanese vowels using this phenomenon. We found that the vibrations produced in the rubber sheet caused audible sound generation when the photoacoustic bone conduction vibration system was used. It is expected that a force level equal to the reference equivalent threshold force level can be achieved at light intensities that lie below the safety limit for human skin exposure by selecting an irradiation wavelength at which a high degree of optical absorption occurs. It is demonstrated that clear sounds can be transmitted to the cochlea using the proposed system, while excluding the effects of acoustic and electric noise in the environment. Improvements in the vibratory force levels realized using this system will enable the development of a novel hearing aid that will provide an alternative to conventional bone conduction hearing aids.}, }
@article {pmid31345679, year = {2020}, author = {Kaneko, M and Sugiyama, Y and Mukudai, S and Hirano, S}, title = {Effect of Voice Therapy Using Semioccluded Vocal Tract Exercises in Singers and Nonsingers With Dysphonia.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {34}, number = {6}, pages = {963.e1-963.e9}, doi = {10.1016/j.jvoice.2019.06.014}, pmid = {31345679}, issn = {1873-4588}, mesh = {*Dysphonia/diagnosis/therapy ; Humans ; *Singing ; *Voice ; Voice Quality ; Voice Training ; }, abstract = {OBJECTIVES: Voice therapy with semioccluded vocal tract exercises (SOVTE) has a long history of use in singers and nonsingers with dysphonia. SOVTE with increased vocal tract impedance leads to increased vocal efficiency and economy. Although there is a growing body of research on the physiological impact of SOVTE, and growing clinical sentiment about its therapeutic benefits, empirical data describing its potential efficacy in singers and nonsingers are lacking. The objective of the current study is to evaluate vocal tract function and voice quality in singers and nonsingers with dysphonia after undergoing SOVTE.
METHODS: Patients who were diagnosed with functional dysphonia, vocal fold nodules, and age-related atrophy were assessed (n = 8 singers, n = 8 nonsingers). Stroboscopic examination, aerodynamic assessment, acoustic analysis, formant frequencies, and self-assessments were evaluated before and after performing SOVTE.
RESULTS: In the singer group, expiratory lung pressure, jitter, shimmer, and self-assessment significantly improved after SOVTE. In addition, the first through fourth formant frequencies and the standard deviations (SDs) of the first, second, and third formant frequencies significantly improved. In the nonsinger group, expiratory lung pressure, jitter, shimmer, and Voice Handicap Index-10 significantly improved after SOVTE. However, no significant changes were observed in formant frequencies.
CONCLUSIONS: These results suggest that SOVTE may improve voice quality in singers and nonsingers with dysphonia, and SOVTE may be more effective at adjusting the vocal tract function in singers with dysphonia compared to nonsingers.}, }
@article {pmid31331237, year = {2020}, author = {Myers, S}, title = {An Acoustic Study of Sandhi Vowel Hiatus in Luganda.}, journal = {Language and speech}, volume = {63}, number = {3}, pages = {506-525}, doi = {10.1177/0023830919862842}, pmid = {31331237}, issn = {1756-6053}, mesh = {Adult ; Female ; Humans ; *Language ; Male ; Middle Aged ; *Phonetics ; *Speech Acoustics ; Uganda ; Young Adult ; }, abstract = {In Luganda (Bantu, Uganda), a sequence of vowels in successive syllables (V.V) is not allowed. If the first vowel is high, the two vowels are joined together in a diphthong (e.g., i + a → i͜a). If the first vowel is non-high, it is deleted with compensatory lengthening of the second vowel in the sequence (e.g., e + a → aː). This paper presents an acoustic investigation of inter-word V#V sequences in Luganda. It was found that the vowel interval in V#V sequences is longer than that in V#C sequences. When the first vowel in V#V is non-high, the formant frequency of the outcome is determined by the second vowel in the sequence. When the first vowel is high, on the other hand, the sequence is realized as a diphthong, with the transition between the two formant patterns taking up most of the duration. The durational patterns within these diphthongs provide evidence against the transcription-based claim that these sequences are reorganized so that the length lies in the second vowel (/i#V/ → [jVː]). The findings bring into question a canonical case of compensatory lengthening conditioned by glide formation.}, }
@article {pmid31307041, year = {2020}, author = {Longo, L and Di Stadio, A and Ralli, M and Marinucci, I and Ruoppolo, G and Dipietro, L and de Vincentiis, M and Greco, A}, title = {Voice Parameter Changes in Professional Musician-Singers Singing with and without an Instrument: The Effect of Body Posture.}, journal = {Folia phoniatrica et logopaedica : official organ of the International Association of Logopedics and Phoniatrics (IALP)}, volume = {72}, number = {4}, pages = {309-315}, doi = {10.1159/000501202}, pmid = {31307041}, issn = {1421-9972}, mesh = {Acoustics ; Humans ; *Music ; *Phonation ; *Posture ; *Singing ; Voice Quality ; }, abstract = {BACKGROUND AND AIM: The impact of body posture on vocal emission is well known. Postural changes may increase muscular resistance in tracts of the phono-articulatory apparatus and lead to voice disorders. This work aimed to assess whether and to what extent body posture during singing and playing a musical instrument impacts voice performance in professional musicians.
SUBJECTS AND METHODS: Voice signals were recorded from 17 professional musicians (pianists and guitarists) while they were singing and while they were singing and playing a musical instrument simultaneously. Metrics were extracted from their voice spectrogram using the Multi-Dimensional Voice Program (MDVP) and included jitter, shift in fundamental voice frequency (sF0), shimmer, change in peak amplitude, noise to harmonic ratio, Voice Turbulence Index, Soft Phonation Index (SPI), Frequency Tremor Intensity Index, Amplitude Tremor Intensity Index, and maximum phonatory time (MPT). Statistical analysis was performed using two-tailed t tests, one-way ANOVA, and χ2 tests. Subjects' body posture was visually assessed following the recommendations of the Italian Society of Audiology and Phoniatrics. Thirty-seven voice signals were collected, 17 during singing and 20 during singing and playing a musical instrument.
RESULTS: Data showed that playing an instrument while singing led to an impairment of the "singer formant" and to a decrease in jitter, sF0, shimmer, SPI, and MPT. However, statistical analysis showed that none of the MDVP metrics changed significantly when subjects played an instrument compared to when they did not. Shoulder and back position affected voice features as measured by the MDVP metrics, while head and neck position did not. In particular, playing the guitar decreased the amplitude of the "singer formant" and increased noise, causing a typical "raucous rock voice."
CONCLUSIONS: Voice features may be affected by the use of the instrument the musicians play while they sing. Body posture selected by the musician while playing the instrument may affect expiration and phonation.}, }
@article {pmid31306606, year = {2019}, author = {Whitfield, JA and Mehta, DD}, title = {Examination of Clear Speech in Parkinson Disease Using Measures of Working Vowel Space.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {62}, number = {7}, pages = {2082-2098}, doi = {10.1044/2019_JSLHR-S-MSC18-18-0189}, pmid = {31306606}, issn = {1558-9102}, mesh = {Aged ; Case-Control Studies ; Female ; Humans ; Male ; Middle Aged ; Parkinson Disease/*physiopathology ; *Phonetics ; Reading ; Speech Acoustics ; Speech Intelligibility/*physiology ; Speech Production Measurement/methods ; }, abstract = {Purpose The purpose of the current study was to characterize clear speech production for speakers with and without Parkinson disease (PD) using several measures of working vowel space computed from frequently sampled formant trajectories. Method The 1st 2 formant frequencies were tracked for a reading passage that was produced using habitual and clear speaking styles by 15 speakers with PD and 15 healthy control speakers. Vowel space metrics were calculated from the distribution of frequently sampled formant frequency tracks, including vowel space hull area, articulatory-acoustic vowel space, and multiple vowel space density (VSD) measures based on different percentile contours of the formant density distribution. Results Both speaker groups exhibited significant increases in the articulatory-acoustic vowel space and VSD10, the area of the outermost (10th percentile) contour of the formant density distribution, from habitual to clear styles. These clarity-related vowel space increases were significantly smaller for speakers with PD than controls. Both groups also exhibited a significant increase in vowel space hull area; however, this metric was not sensitive to differences in the clear speech response between groups. Relative to healthy controls, speakers with PD exhibited a significantly smaller VSD90, the area of the most central (90th percentile), densely populated region of the formant space. Conclusions Using vowel space metrics calculated from formant traces of the reading passage, the current work suggests that speakers with PD do indeed reach the more peripheral regions of the vowel space during connected speech but spend a larger percentage of the time in more central regions of formant space than healthy speakers. Additionally, working vowel space metrics based on the distribution of formant data suggested that speakers with PD exhibited less of a clarity-related increase in formant space than controls, a trend that was not observed for perimeter-based measures of vowel space area.}, }
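Of the working vowel space metrics above, the hull-based one is straightforward to reproduce; a sketch assuming arrays `f1` and `f2` of per-frame formant estimates (Hz) from a reading passage (the density-contour VSD measures additionally require a 2-D kernel density estimate, omitted here):

```python
# Sketch: convex-hull area of frequently sampled (F1, F2) formant tracks.
import numpy as np
from scipy.spatial import ConvexHull

def vowel_space_hull_area(f1, f2):
    pts = np.column_stack([np.asarray(f1), np.asarray(f2)])
    return ConvexHull(pts).volume  # for 2-D input, .volume is the enclosed area (Hz^2)
```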
@article {pmid31306601, year = {2019}, author = {Chiu, YF and Forrest, K and Loux, T}, title = {Relationship Between F2 Slope and Intelligibility in Parkinson's Disease: Lexical Effects and Listening Environment.}, journal = {American journal of speech-language pathology}, volume = {28}, number = {2S}, pages = {887-894}, doi = {10.1044/2018_AJSLP-MSC18-18-0098}, pmid = {31306601}, issn = {1558-9110}, mesh = {Aged ; *Auditory Perception ; Case-Control Studies ; Dysarthria/*physiopathology ; Female ; Humans ; Male ; Middle Aged ; Parkinson Disease/complications/*physiopathology ; *Signal-To-Noise Ratio ; Speech Acoustics ; *Speech Intelligibility ; }, abstract = {Purpose There is a complex relationship between speech production and intelligibility of speech. The current study sought to evaluate the interaction of the factors of lexical characteristics, listening environment, and the 2nd formant transition (F2 slope) on intelligibility of speakers with Parkinson's disease (PD). Method Twelve speakers with PD and 12 healthy controls read sentences that included words with the diphthongs /aɪ/, /ɔɪ/, and /aʊ/. The F2 slope of the diphthong transition was measured and averaged across the 3 diphthongs for each speaker. Young adult listeners transcribed the sentences to assess intelligibility of words with high and low word frequency and high and low neighborhood density in quiet and noisy listening conditions. The average F2 slope and intelligibility scores were entered into regression models to examine their relationship. Results F2 slope was positively related to intelligibility in speakers with PD in both listening conditions with a stronger relationship in noise than in quiet. There was no significant relationship between F2 slope and intelligibility of healthy speakers. In the quiet condition, F2 slope was only correlated with intelligibility in less-frequent words produced by the PD group. In the noise condition, F2 slope was related to intelligibility in high- and low-frequency words and high-density words in PD. Conclusions The relationship between F2 slope and intelligibility in PD was affected by lexical factors and listening conditions. F2 slope was more strongly related to intelligibility in noise than in quiet for speakers with PD. This relationship was absent in highly frequent words presented in quiet and those with fewer lexical neighbors.}, }
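The F2 slope measure used here is, in essence, the rate of change of F2 across the diphthong transition; a minimal sketch assuming per-frame times (s) and F2 values (Hz) between transition onset and offset (the study's exact measurement window may differ):

```python
# Sketch: average F2 slope of a diphthong transition via a straight-line fit.
import numpy as np

def f2_slope(times_s, f2_hz):
    slope_hz_per_s, _intercept = np.polyfit(times_s, f2_hz, deg=1)
    return slope_hz_per_s / 1000.0  # Hz/ms, a common unit for transition rate
```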
@article {pmid31265363, year = {2019}, author = {Bauerly, KR and Jones, RM and Miller, C}, title = {Effects of Social Stress on Autonomic, Behavioral, and Acoustic Parameters in Adults Who Stutter.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {62}, number = {7}, pages = {2185-2202}, doi = {10.1044/2019_JSLHR-S-18-0241}, pmid = {31265363}, issn = {1558-9102}, mesh = {Adult ; Anxiety/complications/physiopathology ; Autonomic Nervous System/*physiology ; Case-Control Studies ; Emotions ; Female ; Humans ; Male ; Middle Aged ; Psychomotor Performance/physiology ; Speech/*physiology ; Speech Acoustics ; Stress, Psychological/*complications/physiopathology ; Stuttering/physiopathology/*psychology ; Young Adult ; }, abstract = {Purpose The purpose of this study was to assess changes in autonomic, behavioral, and acoustic measures in response to social stress in adults who stutter (AWS) compared to adults who do not stutter (ANS). Method Participants completed the State-Trait Anxiety Inventory (Spielberger, Gorsuch, Lushene, Vagg, & Jacobs, 1983). In order to provoke social stress, participants were required to complete a modified version of the Trier Social Stress Test (TSST-M; Kirschbaum, Pirke, & Hellhammer, 1993), which included completing a nonword reading task and then preparing and delivering a speech to what was perceived as a group of professionals trained in public speaking. Autonomic nervous system changes were assessed by measuring skin conductance levels, heart rate, and respiratory sinus arrhythmia (RSA). Behavioral changes during speech production were measured in errors, percentage of syllables stuttered, percentage of other disfluencies, and speaking rate. Acoustic changes were measured using 2nd formant frequency fluctuations. In order to make comparisons of speech with and without social-cognitive stress, measurements were collected while participants completed a speaking task before and during TSST-M conditions. Results AWS showed significantly higher levels of self-reported state and trait anxiety compared to ANS. Autonomic nervous system changes revealed similar skin conductance level and heart rate across pre-TSST-M and TSST-M conditions; however, RSA levels were significantly higher in AWS compared to ANS across conditions. There were no differences found between groups for speaking rate, fundamental frequency, and percentage of other disfluencies when speaking with or without social stress. However, acoustic analysis revealed higher levels of 2nd formant frequency fluctuations in the AWS compared to the controls under pre-TSST-M conditions, followed by a decline to a level that resembled controls when speaking under the TSST-M condition. Discussion Results suggest that AWS, compared to ANS, engage higher levels of parasympathetic control (i.e., RSA) during speaking, regardless of stress level. Higher levels of self-reported state and trait anxiety support this viewpoint and suggest that anxiety may have an indirect role on articulatory variability in AWS.}, }
@article {pmid31255144, year = {2019}, author = {Charles, S and Lulich, SM}, title = {Articulatory-acoustic relations in the production of alveolar and palatal lateral sounds in Brazilian Portuguese.}, journal = {The Journal of the Acoustical Society of America}, volume = {145}, number = {6}, pages = {3269}, doi = {10.1121/1.5109565}, pmid = {31255144}, issn = {1520-8524}, abstract = {Lateral approximant speech sounds are notoriously difficult to measure and describe due to their complex articulation and acoustics. This has prevented researchers from reaching a unifying description of the articulatory and acoustic characteristics of laterals. This paper examines articulatory and acoustic properties of Brazilian Portuguese alveolar and palatal lateral approximants (/l/ and /ʎ/) produced by six native speakers. The methodology for obtaining vocal tract area functions was based on three-dimensional/four-dimensional (3D/4D) ultrasound recordings and 3D digitized palatal impressions with simultaneously recorded audio signals. Area functions were used to calculate transfer function spectra, and predicted formant and anti-resonance frequencies were compared with the acoustic recordings. Mean absolute error in formant frequency prediction was 4% with a Pearson correlation of r = 0.987. Findings suggest anti-resonances from the interdental channels are less important than a prominent anti-resonance from the supralingual cavity but can become important in asymmetrical articulations. The use of 3D/4D ultrasound to study articulatory-acoustic relations is promising, but significant limitations remain and future work is needed to make better use of 3D/4D ultrasound data, e.g., by combining it with magnetic resonance imaging.}, }
@article {pmid31251880, year = {2019}, author = {Heller Murray, ES and Lupiani, AA and Kolin, KR and Segina, RK and Stepp, CE}, title = {Pitch Shifting With the Commercially Available Eventide Eclipse: Intended and Unintended Changes to the Speech Signal.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {62}, number = {7}, pages = {2270-2279}, pmid = {31251880}, issn = {1558-9102}, support = {F31 DC016197/DC/NIDCD NIH HHS/United States ; P50 DC015446/DC/NIDCD NIH HHS/United States ; }, mesh = {Algorithms ; Analysis of Variance ; Female ; Healthy Volunteers ; Humans ; Male ; Pitch Discrimination/*physiology ; Speech/*physiology ; Speech Acoustics ; Young Adult ; }, abstract = {Purpose This study details the intended and unintended consequences of pitch shifting with the commercially available Eventide Eclipse. Method Ten vocally healthy participants (M = 22.0 years; 6 cisgender females, 4 cisgender males) produced a sustained /ɑ/, creating an input signal. This input signal was processed in near real time by the Eventide Eclipse to create an output signal that was either not shifted (0 cents), shifted +100 cents, or shifted -100 cents. Shifts occurred either throughout the entire vocalization or for a 200-ms period after vocal onset. Results Input signals were compared to output signals to examine potential changes. Average pitch-shift magnitudes were within 1 cent of the intended pitch shift. Measured pitch-shift length for intended 200-ms shifts was between 5.9% and 21.7% less than expected, based on the portion of shift selected for measurement. The delay between input and output signals was an average of 11.1 ms. Trials shifted +100 cents had a longer delay than trials shifted -100 or 0 cents. The first 2 formants (F1, F2) shifted in the direction of the pitch shift, with F1 shifting 6.5% and F2 shifting 6.0%. Conclusions The Eventide Eclipse is an accurate pitch-shifting hardware that can be used to explore voice and vocal motor control. The pitch-shifting algorithm shifts all frequencies, resulting in a subsequent change in F1 and F2 during pitch-shifted trials. Researchers using this device should be mindful of stimuli selection to avoid confusion during data interpretation.}, }
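Since the shifts above are specified in cents, a small helper makes the implied frequency ratios concrete:

```python
# +100 cents is one equal-tempered semitone up; -100 cents, one semitone down.
def cents_to_ratio(cents: float) -> float:
    return 2.0 ** (cents / 1200.0)

# cents_to_ratio(100)  -> ~1.0595 (200 Hz becomes ~211.9 Hz)
# cents_to_ratio(-100) -> ~0.9439 (200 Hz becomes ~188.8 Hz)
```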
@article {pmid31251676, year = {2019}, author = {Horáček, J and Radolf, V and Laukkanen, AM}, title = {Experimental and Computational Modeling of the Effects of Voice Therapy Using Tubes.}, journal = {Journal of speech, language, and hearing research : JSLHR}, volume = {62}, number = {7}, pages = {2227-2244}, doi = {10.1044/2019_JSLHR-S-17-0490}, pmid = {31251676}, issn = {1558-9102}, mesh = {Computer Simulation ; Female ; Glottis ; Humans ; Lung/physiology ; Male ; Models, Anatomic ; Phonation/*physiology ; Speech Acoustics ; Speech Therapy/*methods ; Voice Training ; }, abstract = {Purpose Phonations into a tube with the distal end either in the air or submerged in water are used for voice therapy. This study explores the effective mechanisms of these therapy methods. Method The study applied a physical model complemented by calculations from a computational model, and the results were compared to those that have been reported for humans. The effects of tube phonation on vocal tract resonances and oral pressure variation were studied. The relationships of transglottic pressure variation in time Ptrans(t) versus glottal area variation in time GA(t) were constructed. Results The physical model revealed that, for phonation on the [u:] vowel through a glass resonance tube ending in the air, the 1st formant frequency (F1) decreased by 67%, from 315 Hz to 105 Hz, thus slightly above the fundamental frequency (F0) that was set to 90-94 Hz. For phonation through the tube into water, F1 decreased by 91%-92%, reaching 26-28 Hz, and the water bubbling frequency Fb ≅ 19-24 Hz was just below F1. The relationships of Ptrans(t) versus GA(t) clearly differentiate vowel phonation from both therapy methods, and show a physical background for voice therapy with tubes. It is shown that comparable results have been measured in humans during tube therapy. For the tube in air, F1 descends closer to F0, whereas for the tube in water, the frequency Fb occurs close to the acoustic-mechanical resonance of the human vocal tract. Conclusion In both therapy methods, part of the airflow energy required for phonation is substituted by the acoustic energy utilizing the 1st acoustic resonance. Thus, less flow energy is needed for vocal fold vibration, which results in improved vocal efficiency. The effect can be stronger in water resistance therapy if the frequency Fb approaches the acoustic-mechanical resonance of the vocal tract, while simultaneously F0 is voluntarily changed close to F1.}, }
@article {pmid31246660, year = {2020}, author = {Suresh, CH and Krishnan, A and Luo, X}, title = {Human Frequency Following Responses to Vocoded Speech: Amplitude Modulation Versus Amplitude Plus Frequency Modulation.}, journal = {Ear and hearing}, volume = {41}, number = {2}, pages = {300-311}, doi = {10.1097/AUD.0000000000000756}, pmid = {31246660}, issn = {1538-4667}, mesh = {Acoustic Stimulation ; Adult ; *Cochlear Implantation ; *Cochlear Implants ; Cues ; Humans ; Speech ; *Speech Perception ; }, abstract = {OBJECTIVES: The most commonly employed speech processing strategies in cochlear implants (CIs) only extract and encode amplitude modulation (AM) in a limited number of frequency channels. A novel speech processing strategy that encodes both frequency modulation (FM) and AM has been proposed to improve CI performance; in behavioral tests, it yielded better speech, speaker, and tone recognition than the AM-alone strategy. Here, we used the scalp-recorded human frequency following responses (FFRs) to examine the differences in the neural representation of vocoded speech sounds with AM alone and AM + FM as the spectral and temporal cues were varied. Specifically, we were interested in determining whether the addition of FM to AM improved the neural representation of envelope periodicity (FFRENV) and temporal fine structure (FFRTFS), as reflected in the temporal pattern of the phase-locked neural activity generating the FFR.
DESIGN: FFRs were recorded from 13 normal-hearing, adult listeners in response to the original unprocessed stimulus (a synthetic diphthong /au/ with a 110-Hz fundamental frequency or F0 and a 250-msec duration) and the 2-, 4-, 8- and 16-channel sine vocoded versions of /au/ with AM alone and AM + FM. Temporal waveforms, autocorrelation analyses, fast Fourier Transform, and stimulus-response spectral correlations were used to analyze both the strength and fidelity of the neural representation of envelope periodicity (F0) and TFS (formant structure).
RESULTS: The periodicity strength in the FFRENV decreased more for the AM stimuli than for the relatively resilient AM + FM stimuli as the number of channels was increased. Regardless of the number of channels, a clear spectral peak of FFRENV was consistently observed at the stimulus F0 for all the AM + FM stimuli but not for the AM stimuli. Neural representation as revealed by the spectral correlation of FFRTFS was better for the AM + FM stimuli when compared to the AM stimuli. Neural representation of the time-varying formant-related harmonics as revealed by the spectral correlation was also better for the AM + FM stimuli as compared to the AM stimuli.
CONCLUSIONS: These results are consistent with previously reported behavioral results and suggest that the AM + FM processing strategy elicited brainstem neural activity that better preserved periodicity, temporal fine structure, and time-varying spectral information than the AM processing strategy. The relatively more robust neural representation of AM + FM stimuli observed here likely contributes to the superior performance on speech, speaker, and tone recognition with the AM + FM processing strategy. Taken together, these results suggest that neural information preserved in the FFR may be used to evaluate signal processing strategies considered for CIs.}, }
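One of the FFR analyses described above, quantifying envelope periodicity at the stimulus F0, can be sketched as a spectral peak measurement; `ffr` is assumed to be an averaged response waveform at rate `sr`, the 110 Hz default matches this study's stimulus, and the authors' exact windowing and noise-floor estimation are not reproduced:

```python
# Sketch: amplitude of the FFR_ENV spectral peak near the stimulus F0.
import numpy as np

def f0_peak_amplitude(ffr, sr, f0=110.0, halfwidth=5.0):
    spec = np.abs(np.fft.rfft(ffr))
    freqs = np.fft.rfftfreq(len(ffr), d=1.0 / sr)
    band = (freqs >= f0 - halfwidth) & (freqs <= f0 + halfwidth)
    return spec[band].max()  # peak magnitude within the F0 band
```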
@article {pmid31231051, year = {2019}, author = {Stansbury, AL and Janik, VM}, title = {Formant Modification through Vocal Production Learning in Gray Seals.}, journal = {Current biology : CB}, volume = {29}, number = {13}, pages = {2244-2249.e4}, doi = {10.1016/j.cub.2019.05.071}, pmid = {31231051}, issn = {1879-0445}, mesh = {Animals ; Female ; *Learning ; Male ; Seals, Earless/*physiology ; *Vocalization, Animal ; }, abstract = {Vocal production learning is a rare communication skill and has only been found in selected avian and mammalian species [1-4]. Although humans use learned formants and voiceless sounds to encode most lexical information [5], evidence for vocal learning in other animals tends to focus on the modulation pattern of the fundamental frequency [3, 4]. Attempts to teach mammals to produce human speech sounds have largely been unsuccessful, most notably in extensive studies on great apes [5]. The limited evidence for formant copying in mammals raises the question whether advanced learned control over formant production is uniquely human. We show that gray seals (Halichoerus grypus) have the ability to match modulations in peak frequency patterns of call sequences or melodies by modifying the formants in their own calls, moving outside of their normal repertoire's distribution of frequencies and even copying human vowel sounds. Seals also demonstrated enhanced auditory memory for call sequences by accurately copying sequential changes in peak frequency and the number of calls played to them. Our results demonstrate that formants can be influenced by vocal production learning in non-human vocal learners, providing a mammalian substrate for the evolution of flexible information coding in formants as found in human language.}, }
@article {pmid31202525, year = {2020}, author = {Dahl, KL and Mahler, LA}, title = {Acoustic Features of Transfeminine Voices and Perceptions of Voice Femininity.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {34}, number = {6}, pages = {961.e19-961.e26}, doi = {10.1016/j.jvoice.2019.05.012}, pmid = {31202525}, issn = {1873-4588}, mesh = {Acoustics ; Female ; Femininity ; Humans ; Male ; Speech Acoustics ; *Speech Perception ; *Transgender Persons ; *Voice ; Voice Quality ; }, abstract = {The purpose of this study was to evaluate the relationships between acoustic measures of transfeminine voices and both self- and listener ratings of voice femininity. Connected speech samples were collected from 12 transfeminine individuals (M = 36.3 years, SD = 10.6 years) and a control group of five cisgender (cis) women and five cis men (M = 35.3 years, SD = 13.3 years). The acoustic measures of fundamental frequency (fo), fo variation, formant frequencies, and vocal intensity were calculated from these samples. Transfeminine speakers rated their own voices on a five-point scale of voice femininity. Twenty inexperienced listeners heard an excerpt of each speech sample and rated the voices on the same five-point scale of voice femininity. Spearman's rank-order correlation coefficients were calculated to measure the relationships between the acoustic variables and ratings of voice femininity. Significant positive correlations were found between fo and both self-ratings (r = 0.712, P = 0.009) and listener ratings of voice femininity (r = 0.513, P < 0.001). Significant positive correlations were found between intensity and both self-ratings (r = 0.584, P = 0.046) and listener ratings of voice femininity (r = 0.584, P = 0.046). No significant correlations were found between fo variation or formant frequencies and perceptual ratings of voice femininity. A Pearson's chi-square test of independence showed that the distribution of self- and listener ratings differed significantly (χ[2] = 9.668, P = 0.046). Self- and listener ratings were also shown to be strongly correlated (r = 0.912, P < 0.001). This study provides further evidence to support the selection of training targets in voice feminization programs for transfeminine individuals and promotes the use of self-ratings of voice as an important outcome measure.}, }
@article {pmid31193755, year = {2019}, author = {Sankar, MSA and Sathidevi, PS}, title = {A scalable speech coding scheme using compressive sensing and orthogonal mapping based vector quantization.}, journal = {Heliyon}, volume = {5}, number = {5}, pages = {e01820}, doi = {10.1016/j.heliyon.2019.e01820}, pmid = {31193755}, issn = {2405-8440}, abstract = {A novel scalable speech coding scheme based on Compressive Sensing (CS), which can operate at bit rates from 3.275 to 7.275 kbps, is designed and implemented in this paper. CS-based speech coding offers the benefit of combined compression and encryption with inherent de-noising and bit rate scalability. The non-stationary nature of the speech signal makes the recovery process from CS measurements very complex due to the variation in sparsifying bases. In this work, the complexity of the recovery process is reduced by assigning a suitable basis to each frame of the speech signal based on its statistical properties. As the quality of the reconstructed speech depends on the sensing matrix used at the transmitter, a variant of the Binary Permuted Block Diagonal (BPBD) matrix is also proposed here, which offers better performance than the commonly used Gaussian random matrix. To improve the coding efficiency, formant filter coefficients are quantized using conventional Vector Quantization (VQ), and an orthogonal mapping based VQ is developed for the quantization of CS measurements. The proposed coding scheme offers listening quality for the reconstructed speech similar to that of the Adaptive Multi-Rate Narrowband (AMR-NB) codec at 6.7 kbps and Enhanced Voice Services (EVS) at 7.2 kbps. A separate de-noising block is not required in the proposed coding scheme due to the inherent de-noising property of CS. Scalability in bit rate is achieved in the proposed method by varying the number of random measurements and the number of levels for orthogonal mapping in the VQ stage of measurements.}, }
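The CS front end described above reduces each frame to a small number of random projections; a toy sketch with the commonly used Gaussian sensing matrix (the paper's BPBD variant is a structured binary alternative, and recovery additionally needs a sparse solver plus a per-frame basis, both omitted):

```python
# Toy sketch of compressive sensing of one speech frame: y = Phi @ x.
import numpy as np

rng = np.random.default_rng(0)
n, m = 160, 64                                   # frame length, number of measurements
x = rng.standard_normal(n)                       # stand-in for one speech frame
Phi = rng.standard_normal((m, n)) / np.sqrt(m)   # Gaussian sensing matrix
y = Phi @ x                                      # measurements passed on to the VQ stage
```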
@article {pmid31183861, year = {2019}, author = {de Carvalho, CC and da Silva, DM and de Carvalho Junior, AD and Santos Neto, JM and Rio, BR and Neto, CN and de Orange, FA}, title = {Pre-operative voice evaluation as a hypothetical predictor of difficult laryngoscopy.}, journal = {Anaesthesia}, volume = {74}, number = {9}, pages = {1147-1152}, doi = {10.1111/anae.14732}, pmid = {31183861}, issn = {1365-2044}, mesh = {Adult ; Anesthesia, General ; Female ; Humans ; Intubation, Intratracheal/*methods ; Laryngoscopy/*methods ; Male ; Middle Aged ; Predictive Value of Tests ; Preoperative Care/*methods ; Prospective Studies ; Voice/*physiology ; }, abstract = {We examined the potential for voice sounds to predict a difficult airway as compared with prediction by the modified Mallampati test. A total of 453 patients scheduled for elective surgery under general anaesthesia with tracheal intubation were studied. Five phonemes were recorded and their formants analysed. Difficult laryngoscopy was defined as Cormack-Lehane grade 3 or 4. Univariate and multivariate logistic regression were used to examine the association between several variables (mouth opening, sternomental distance, modified Mallampati, and formants) and difficult laryngoscopy. Difficult laryngoscopy was reported in 29/453 (6.4%) patients. Among the five regression models evaluated, the model achieving the best performance in predicting difficult laryngoscopy after variable selection (stepwise, multivariate) included the modified Mallampati classification (OR 2.920; 95%CI 1.992-4.279; p < 0.001), the first formant of /i/ (iF1) (OR 1.003; 95%CI 1.002-1.04; p < 0.001), and the second formant of /i/ (iF2) (OR 0.998; 95%CI 0.997-0.998; p < 0.001). The receiver operating characteristic curve for a regression model that included both formants and the modified Mallampati score showed an area under the curve of 0.918, higher than that for the formants alone (0.761) or the modified Mallampati score alone (0.874). Voice showed a significant association with difficult laryngoscopy during general anaesthesia, with a 76.1% probability of correctly classifying a randomly selected patient.}, }
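The prediction setup reported here, logistic regression on the modified Mallampati class plus the /i/ formants scored by ROC AUC, can be sketched as follows; the feature files and column order are hypothetical, and the in-sample AUC is illustrative only (the study's values come from its own fitting and selection procedure):

```python
# Sketch: difficult-laryngoscopy prediction from Mallampati class and /i/ formants.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

X = np.load("airway_features.npy")   # hypothetical (n_patients, 3): [mallampati, iF1_hz, iF2_hz]
y = np.load("difficult_dl.npy")      # hypothetical labels: Cormack-Lehane 3-4 -> 1, else 0
clf = LogisticRegression().fit(X, y)
print(roc_auc_score(y, clf.predict_proba(X)[:, 1]))  # in-sample AUC, illustrative only
```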
@article {pmid31176869, year = {2019}, author = {Easwar, V and Scollie, S and Purcell, D}, title = {Investigating potential interactions between envelope following responses elicited simultaneously by different vowel formants.}, journal = {Hearing research}, volume = {380}, number = {}, pages = {35-45}, doi = {10.1016/j.heares.2019.05.005}, pmid = {31176869}, issn = {1878-5891}, mesh = {*Acoustic Stimulation ; Adolescent ; Adult ; Auditory Pathways/*physiology ; *Evoked Potentials, Auditory ; Female ; Humans ; Male ; *Periodicity ; *Speech Acoustics ; *Speech Perception ; Time Factors ; *Voice Quality ; Young Adult ; }, abstract = {Envelope following responses (EFRs) evoked by the periodicity of voicing in vowels are elicited at the fundamental frequency of voice (f0), irrespective of the harmonics that initiate it. One approach to improving the frequency specificity of vowel stimuli without increasing test-time is by altering the f0 selectively in one or more formants. The harmonics contributing to an EFR can then be differentiated by the unique f0 at which the EFRs are elicited. The advantages of using such an approach would be increased frequency specificity and efficiency, given that multiple EFRs can be evaluated in a certain test-time. However, multiple EFRs elicited simultaneously could interact and lead to altered amplitudes and outcomes. To this end, the present study aimed to evaluate: (i) if simultaneous recording of two EFRs, one elicited by harmonics in the first formant (F1) and one elicited by harmonics in the second and higher formants (F2+), leads to attenuation or enhancement of EFR amplitude, and (ii) if simultaneous measurement of two EFRs affects its accuracy and anticipated efficiency. In a group of 22 young adults with normal hearing, EFRs were elicited by F1 and F2+ bands of /u/, /a/ and /i/ when F1 and F2+ were presented independently (individual), when F1 and F2+ were presented simultaneously (dual), and when F1 or F2+ was presented with spectrally matched Gaussian noise of the other (noise). Repeated-measures analysis of variance indicated no significant group differences in EFR amplitudes between any of the conditions, suggesting minimal between-EFR interactions. Between-participant variability was evident; however, significant changes were evident only in a third of the participants for the stimulus /u/ F1. For the majority of stimuli, the change between individual and dual conditions was positively correlated with the change between individual and noise conditions, suggesting that interaction-based changes in EFR amplitude, when present, were likely due to the restriction of cochlear regions of excitation in the presence of a competing stimulus. The amplitude of residual noise was significantly higher in the dual or noise relative to the individual conditions, although the mean differences were very small (<3 nV). F-test-based detection of EFRs, commonly used to determine the presence of an EFR, did not vary across conditions. Further, neither the mean reduction in EFR amplitude nor the mean increase in noise amplitude in dual relative to individual conditions was large enough to alter the anticipated gain in efficiency of simultaneous EFR recordings. Together, results suggest that the approach of simultaneously recording two vowel-evoked EFRs from different formants for improved frequency-specificity does not alter test accuracy and is more time-efficient than evaluating EFRs to each formant individually.}, }
@article {pmid31164264, year = {2019}, author = {Mou, Z and Teng, W and Ouyang, H and Chen, Y and Liu, Y and Jiang, C and Zhang, J and Chen, Z}, title = {Quantitative analysis of vowel production in cerebral palsy children with dysarthria.}, journal = {Journal of clinical neuroscience : official journal of the Neurosurgical Society of Australasia}, volume = {66}, number = {}, pages = {77-82}, doi = {10.1016/j.jocn.2019.05.020}, pmid = {31164264}, issn = {1532-2653}, mesh = {Adolescent ; Cerebral Palsy/*complications ; Child ; Dysarthria/etiology/*physiopathology ; Female ; Humans ; Male ; Phonetics ; *Speech Acoustics ; Speech Intelligibility ; }, abstract = {OBJECTIVE: The present study aimed to identify certain acoustic parameters for speech evaluation in cerebral palsy children with dysarthria.
METHODS: The subjects included 30 native Mandarin-speaking children with cerebral palsy, aged 5-15 years, and 13 healthy children in a similar age range. Each subject was recorded while producing a list of 12 Mandarin words, which included three syllables ('ba', 'bi', and 'du') in all four Mandarin tones. The formants (F1 and F2) of the monophthong vowels /a, i, u/ were extracted from each vowel token. Based on F1 and F2, the vowel acoustic indexes VSA, VAI, and FCR were calculated and analyzed.
RESULTS: Compared with the control group, the cerebral palsy group had significantly lower F1 and F2 in vowel /a/ (P < 0.05) and lower F2 in vowel /i/ (P < 0.05), while F1 and F2 in vowel /u/ and F1 in vowel /i/ showed no significant difference. Between the healthy group and the cerebral palsy group, the differences in VSA, VAI, and FCR were all statistically significant.
CONCLUSION: Children with cerebral palsy have a reduced vowel space and reduced speech articulation. The significant differences in the vowel acoustic indexes (VSA, VAI, and FCR) between the two groups revealed that the three indexes are sensitive to variation in vowel production in children with cerebral palsy, and that they may be used to evaluate the loss of speech intelligibility caused by impaired vowel pronunciation and the effect of rehabilitation therapy.}, }
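The three vowel indexes used in this study have standard closed forms (FCR is the reciprocal of VAI); a sketch taking the mean corner-vowel formants (Hz) as inputs:

```python
# Standard formulas: triangular vowel space area (VSA), vowel articulation
# index (VAI), and formant centralization ratio (FCR) from /a, i, u/ formants.
def vowel_indexes(f1a, f2a, f1i, f2i, f1u, f2u):
    vsa = abs(f1i * (f2a - f2u) + f1a * (f2u - f2i) + f1u * (f2i - f2a)) / 2.0
    vai = (f2i + f1a) / (f1i + f1u + f2u + f2a)
    fcr = 1.0 / vai  # vowel centralization shrinks VSA and VAI and raises FCR
    return vsa, vai, fcr
```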
@article {pmid31153772, year = {2020}, author = {Buckley, DP and Dahl, KL and Cler, GJ and Stepp, CE}, title = {Transmasculine Voice Modification: A Case Study.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {34}, number = {6}, pages = {903-910}, pmid = {31153772}, issn = {1873-4588}, support = {F31 DC014872/DC/NIDCD NIH HHS/United States ; F32 DC017637/DC/NIDCD NIH HHS/United States ; R01 DC015570/DC/NIDCD NIH HHS/United States ; }, mesh = {Adult ; Humans ; Male ; Masculinity ; Speech ; *Speech Acoustics ; *Voice ; Voice Training ; }, abstract = {This case study measured the effects of manual laryngeal therapy on the fundamental frequency (fo), formant frequencies, estimated vocal tract length, and listener perception of masculinity of a 32-year-old transmasculine individual. The participant began testosterone therapy 1.5 years prior to the study. Two therapy approaches were administered sequentially in a single session: (1) passive circumlaryngeal massage and manual laryngeal reposturing, and (2) active laryngeal reposturing with voicing. Acoustic recordings were collected before and after each treatment and 3 days after the session. Speaking fo decreased from 124 Hz to 120 Hz after passive training, and to 108 Hz after active training. Estimated vocal tract length increased from 17.0 cm to 17.3 cm after passive training, and to 19.4 cm after active training. Eight listeners evaluated the masculinity of the participant's speech; his voice was rated as most masculine at the end of the training session. All measures returned to baseline at follow-up. Overall, both acoustic and perceptual changes were observed in one transmasculine individual who participated in manual laryngeal therapy, even after significant testosterone-induced voice changes had already occurred; however, changes were not maintained in the follow-up. This study adds to scant literature on effective approaches to and proposed outcome measures for voice masculinization in transmasculine individuals.}, }
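An acoustic vocal-tract-length estimate of the kind reported above can be sketched from the uniform closed-open tube approximation, where the k-th formant implies VTL = (2k - 1)c / (4 Fk); the study's own estimation method may differ:

```python
# Sketch: vocal tract length from formants under a closed-open tube model.
import numpy as np

def estimate_vtl_cm(formants_hz, c=35000.0):      # speed of sound in cm/s
    f = np.asarray(formants_hz, dtype=float)
    k = np.arange(1, len(f) + 1)
    return float(np.mean((2 * k - 1) * c / (4 * f)))

# estimate_vtl_cm([500, 1500, 2500]) -> 17.5 cm (a neutral adult tract)
```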
@article {pmid31153348, year = {2019}, author = {Chen, WR and Whalen, DH and Shadle, CH}, title = {F0-induced formant measurement errors result in biased variabilities.}, journal = {The Journal of the Acoustical Society of America}, volume = {145}, number = {5}, pages = {EL360}, pmid = {31153348}, issn = {1520-8524}, support = {R01 DC002717/DC/NIDCD NIH HHS/United States ; }, mesh = {*Acoustics ; *Bias ; Humans ; Phonetics ; Sound Spectrography/methods ; *Speech Acoustics ; Speech Perception/*physiology ; }, abstract = {Many developmental studies attribute reduction of acoustic variability to increasing motor control. However, linear prediction-based formant measurements are known to be biased toward the nearest harmonic of F0, especially at high F0s. Thus, the amount of reported formant variability generated by changes in F0 is unknown. Here, 470 000 vowels were synthesized, mimicking statistics reported in four developmental studies, to estimate the proportion of formant variability that can be attributed to F0 bias, as well as other formant measurement errors. Results showed that the F0-induced formant measurements errors are large and systematic, and cannot be eliminated by a large sample size.}, }
@article {pmid31153321, year = {2019}, author = {Briefer, EF and Vizier, E and Gygax, L and Hillmann, E}, title = {Expression of emotional valence in pig closed-mouth grunts: Involvement of both source- and filter-related parameters.}, journal = {The Journal of the Acoustical Society of America}, volume = {145}, number = {5}, pages = {2895}, doi = {10.1121/1.5100612}, pmid = {31153321}, issn = {1520-8524}, mesh = {Animals ; Arousal/*physiology ; Behavior, Animal/*physiology ; Emotions/*physiology ; Face/physiology ; Female ; Male ; Mouth/physiology ; Swine ; Vocalization, Animal/*physiology ; Voice ; }, abstract = {Emotion expression plays a crucial role for regulating social interactions. One efficient channel for emotion communication is the vocal-auditory channel, which enables a fast transmission of information. Filter-related parameters (formants) have been suggested as a key to the vocal differentiation of emotional valence (positive versus negative) across species, but variation in relation to emotions has rarely been investigated. Here, whether pig (Sus scrofa domesticus) closed-mouth grunts differ in source- and filter-related features when produced in situations assumed to be positive and negative is investigated. Behavioral and physiological parameters were used to validate the animals' emotional state (both in terms of valence and arousal, i.e., bodily activation). Results revealed that grunts produced in a positive situation were characterized by higher formants, a narrower range of the third formant, a shorter duration, a lower fundamental frequency, and a lower harmonicity compared to negative grunts. Particularly, formant-related parameters and duration made up most of the difference between positive and negative grunts. Therefore, these parameters have the potential to encode dynamic information and to vary as a function of the emotional valence of the emitter in pigs, and possibly in other mammals as well.}, }
@article {pmid31153297, year = {2019}, author = {Houde, JF and Gill, JS and Agnew, Z and Kothare, H and Hickok, G and Parrell, B and Ivry, RB and Nagarajan, SS}, title = {Abnormally increased vocal responses to pitch feedback perturbations in patients with cerebellar degeneration.}, journal = {The Journal of the Acoustical Society of America}, volume = {145}, number = {5}, pages = {EL372}, pmid = {31153297}, issn = {1520-8524}, support = {R01 DC017696/DC/NIDCD NIH HHS/United States ; R01 NS105839/NS/NINDS NIH HHS/United States ; R01 DC013979/DC/NIDCD NIH HHS/United States ; R01 DC010145/DC/NIDCD NIH HHS/United States ; R01 DC017091/DC/NIDCD NIH HHS/United States ; }, mesh = {Acoustic Stimulation/methods ; Adult ; Feedback ; Feedback, Sensory/*physiology ; Female ; Humans ; Male ; Middle Aged ; Pitch Perception/*physiology ; Speech/*physiology ; Speech Perception/physiology ; Voice/*physiology ; }, abstract = {Cerebellar degeneration (CD) has deleterious effects on speech motor behavior. Recently, a dissociation between feedback and feedforward control of speaking was observed in CD: Whereas CD patients exhibited reduced adaptation across trials to consistent formant feedback alterations, they showed enhanced within-trial compensation for unpredictable formant feedback perturbations. In this study, it was found that CD patients exhibit abnormally increased within-trial vocal compensation responses to unpredictable pitch feedback perturbations. Taken together with recent findings, the results indicate that CD is associated with a general hypersensitivity to auditory feedback during speaking.}, }
@article {pmid31147205, year = {2021}, author = {Reinheimer, DM and Andrade, BMR and Nascimento, JKF and Fonte, JBM and Araújo, IMP and Martins-Filho, PRS and Salvatori, R and Valença, EHO and Oliveira, AHA and Aguiar-Oliveira, MH and Oliveira-Neto, LA}, title = {Formant Frequencies, Cephalometric Measures, and Pharyngeal Airway Width in Adults With Congenital, Isolated, and Untreated Growth Hormone Deficiency.}, journal = {Journal of voice : official journal of the Voice Foundation}, volume = {35}, number = {1}, pages = {61-68}, doi = {10.1016/j.jvoice.2019.04.014}, pmid = {31147205}, issn = {1873-4588}, mesh = {Adult ; Cephalometry ; *Dwarfism, Pituitary ; Growth Hormone ; Humans ; Mandible/diagnostic imaging ; Pharynx/diagnostic imaging ; }, abstract = {OBJECTIVE: Adult subjects with isolated growth hormone deficiency (IGHD) due to a mutation in the growth hormone releasing hormone receptor gene exhibit higher formant frequencies. In normal subjects, a significant negative association between formant frequencies and linear craniofacial measurements, especially of the maxilla and mandible, has been reported. This suggests a smaller pharyngeal width, despite the low prevalence of obstructive sleep apnea syndrome. Here we evaluate their pharyngeal airway width, its correlation with vowel formant frequencies, and the correlation between these and the craniofacial measures.
SUBJECTS AND METHODS: A two-step protocol was performed. The first step, a case-control experiment aimed at assessing pharyngeal width, compared nine adult IGHD subjects and 36 normal-statured controls. Both upper and lower pharyngeal widths were measured. The second step (correlation analysis) was performed only in the IGHD group.
RESULTS: Upper and lower pharyngeal widths were similar in IGHD subjects and controls. In IGHD subjects, the lower pharyngeal width showed a negative correlation with F1 [a] and a positive correlation with mandibular length. F1 and F2 correlated negatively with the linear measures and positively with the angular measures.
CO