# This program computes the phonotactic probability metrics employed by # Jusczyk et al in their study of infants' knowledge of phonotactics # It uses the wordlist CelexWordsInTranscription.txt to calculate # probabilities from the English lexicon, and then applies these probabilities # to test items, to give predicted well-formedness scores. $count_cvc_only = 0; $freq_threshhold = 1; $print_all_consonants = 0; $vowels = "aeiouAEIOU&V\@3╠п┐оОYЪWoХСЯBMLи"; print "\nCounting frequencies in wordlist\n"; open (WORDLIST, "RandomHouseMonosyllables.txt") or die "Warning! Can't open wordlist: $!\n"; # The first line is the headers; skip it ; while ($line = ) { chomp($line); ($badfreq, $freq, $orthography, $transcription, $template) = split("\t", $line); next if ($freq < $freq_threshhold); $original_transcription = $transcription; $transcription =~ s/^\[//; $transcription =~ s/\]$//; # It will help a lot to have every phoneme represented by a single character $transcription = remove_digraphs($transcription); # Kessler & Treiman count only CVC, but this is something we might want to explore if ($count_cvc_only and $template ne "CVC") { next; } # First, adjust positional probabilities if ($transcription =~ /([^$vowels]*)([$vowels])([^$vowels]*)/) { $onset = $1; $nucleus = $2; $coda = $3; } else { print "Error: can't parse syllable [$syl] into onset, nucleus, and coda in the word $original_transcription\n"; } # Kessler & Treiman ignore empty positions, but this too is something that could make a difference if ($onset eq "") { $onset = " "; } if ($coda eq "") { $coda = " "; } # First the generic, position-independent counts $TotalFreq{$onset}++; $TotalFreq{$nucleus}++; $TotalFreq{$coda}++; # Bookkeeping: handy to have a list of all consonants $Consonants{$onset} = 1; $Consonants{$coda} = 1; # Then, the contextual (position-dependent) counts $OnsetFreq{$onset}++; $NucleusFreq{$nucleus}++; $CodaFreq{$coda}++; $total_onsets++; $total_nuclei++; $total_codas++; # Now, count co-occurrences $OnsetNuc{$onset}{$nucleus}++; $NucCoda{$nucleus}{$coda}++; $OnsetCoda{$onset}{$coda}++; } close (WORDLIST); # Now that we're done counting from the corpus, we can calculate the probabilities print "\nCalculating positional probabilities\n"; # First, the positional probabilities of each phoneme foreach $onset (keys %OnsetFreq) { $OnsetProb{$onset} = $OnsetFreq{$onset} / $total_onsets; } foreach $nucleus (keys %NucleusFreq) { $NucleusProb{$nucleus} = $NucleusFreq{$nucleus} / $total_nuclei; } foreach $coda (keys %CodaFreq) { $CodaProb{$coda} = $CodaFreq{$coda} / $total_codas; } # Let's save the results in a file open (POSPROBFILE, ">RHPositionalProbabilities.txt") or die "Warning! Can't create positional probabilties file: $!\n"; # Now print the results, in decreasing probability: printf POSPROBFILE "*****************************\n Onset Probabilities\n*****************************\n"; foreach $onset (sort {$OnsetProb{$b} <=> $OnsetProb{$a}} keys %OnsetProb) { printf POSPROBFILE replace_digraphs($onset) . "\t$OnsetProb{$onset}\n"; } printf POSPROBFILE "\n*****************************\n Nucleus Probabilities\n*****************************\n"; foreach $nucleus (sort {$NucleusProb{$b} <=> $NucleusProb{$a}} keys %NucleusProb) { printf POSPROBFILE replace_digraphs($nucleus) . "\t$NucleusProb{$nucleus}\n"; } printf POSPROBFILE "\n*****************************\n Coda Probabilities\n*****************************\n"; foreach $coda (sort {$CodaProb{$b} <=> $CodaProb{$a}} keys %CodaProb) { printf POSPROBFILE replace_digraphs($coda) . "\t$CodaProb{$coda}\n"; } close (POSPROBFILE); # Now on to Study 1: print out the onset/coda distributions, for Study 1 chi square tests print "\nCalculating onset/coda asymmetries\n"; open (ONSETCODA, ">RH-OnsetCodaDistributions.txt") or die "Warning! Can't create file for onset/coda distributions: $!\n"; @vowels = ("i", "I", "О", "E", "a", "&", "u", "U", "o", "┐", "V", "A", "Y", "W", "Ъ", "и", "3", "╠", "п", "┐", "о", "Х", "С", "Я", "B", "M", "L"); @consonants = ("p", "t", "k", "b", "d", "g", "f", "T", "s", "S", "h", "C", "v", "D", "z", "Z", "J", "m", "n", "N", "l", "r", "w", "j"); printf ONSETCODA "Consonant\tTotal\tOnset (obs)\tCoda (obs)\tOnset (exp)\tCoda (exp)\tOnset Bias\tchi^2\n"; foreach $consonant (@consonants) { $seen_consonants{$consonant} = 1; $expected = $TotalFreq{$consonant} / 2; $onset_bias = $OnsetFreq{$consonant} / $TotalFreq{$consonant}; $chi_square = 2 * (($OnsetFreq{$consonant} - $expected)**2)/$expected; $line = "$consonant\t$TotalFreq{$consonant}\t$OnsetFreq{$consonant}\t$CodaFreq{$consonant}\t$expected\t$expected\t$onset_bias\t$chi_square\n"; $line =~ s/\t\t/\t0\t/g; printf ONSETCODA $line; } # There will also be other consonants in the list (esp clusters) # So, get those from the keys of the %consonants hash foreach $consonant (sort keys %Consonants) { unless ($seen_consonants{$consonant}) { $expected = $TotalFreq{$consonant} / 2; $onset_bias = $OnsetFreq{$consonant} / $TotalFreq{$consonant}; $chi_square = 2 * (($OnsetFreq{$consonant} - $expected)**2)/$expected; $line = "$consonant\t$TotalFreq{$consonant}\t$OnsetFreq{$consonant}\t$CodaFreq{$consonant}\t$expected\t$expected\t$onset_bias\t$chi_square\n"; $line =~ s/\t\t/\t0\t/g; printf ONSETCODA $line; } } %seen_consonants = undef; # Now, calculate co-occurrence counts print "\nTallying co-occurrences\n"; open (COOCCUR, ">RH-Cooccurrences.txt") or die "Warning! Can't create file of cooccurrence counts: $!\n"; open (EXPECTED, ">RH-ExpectedCooccurrences.txt") or die "Warning! Can't create file of expected cooccurrence counts: $!\n"; printf COOCCUR "Observed Onset-Nucleus Co-occurrences:\n"; printf EXPECTED "Expected Onset-Nucleus Co-occurrences:\n"; foreach $vowel (@vowels) { if (exists $NucleusFreq{$vowel}) { printf COOCCUR "\t". replace_digraphs($vowel); printf EXPECTED "\t". replace_digraphs($vowel); } } printf COOCCUR "\n"; printf EXPECTED "\n"; foreach $consonant (@consonants) { $line = undef; $expected_line = undef; # We'll print values for each predefined consonant that actually occurs in the corpus if (exists $OnsetFreq{$consonant} or exists $CodaFreq{$consonant}) { printf COOCCUR replace_digraphs($consonant); printf EXPECTED replace_digraphs($consonant); foreach $vowel (@vowels) { if (exists $NucleusFreq{$vowel}) { $line .= "\t$OnsetNuc{$consonant}{$vowel}"; # The expected count is proportional to the joint probability of this onset and nucleus $expected_line .= "\t" . ($OnsetProb{$consonant} * $NucleusProb{$vowel} * $total_nuclei); } } # For undefined (unseen) values, provide 0's $line =~ s/\t(?=\t)/\t0/g; $line =~ s/\t$/\t0/; $expected_line =~ s/\t(?=\t)/\t0/g; $expected_line =~ s/\t$/\t0/; printf COOCCUR "$line\n"; printf EXPECTED "$expected_line\n"; $seen_consonants{$consonant} = 1; } } # We might still have more onsets that were found in the corpus, but not in the handy ordered list # provided above. (For example, if we also count complex onsets) # So, now go back and look at ALL attested onsets, skipping the ones we've already handled. # (This looks overly complex, but the goal is to improve readability and make it easier to analyze # the results, but guaranteeing that ordinary stuff is up front in a logical order) if ($print_all_consonants) { foreach $consonant (sort keys %OnsetFreq) { $line = undef; $expected_line = undef; unless ($seen_consonants{$consonant}) { printf COOCCUR replace_digraphs($consonant); printf EXPECTED replace_digraphs($consonant); foreach $vowel (@vowels) { if (exists $NucleusFreq{$vowel}) { $line .= "\t$OnsetNuc{$consonant}{$vowel}"; # The expected count is proportional to the joint probability of this onset and nucleus $expected_line .= "\t" . ($OnsetProb{$consonant} * $NucleusProb{$vowel} * $total_nuclei); } } $line =~ s/\t(?=\t)/\t0/g; $line =~ s/\t$/\t0/; $expected_line =~ s/\t(?=\t)/\t0/g; $expected_line =~ s/\t$/\t0/; printf COOCCUR "$line\n"; printf EXPECTED "$expected_line\n"; } } } %seen_consonants = undef; $line = undef; $expected_line = undef; printf COOCCUR "\nObserved Nucleus-Coda Co-occurrences:\n"; printf EXPECTED "\nExpected Nucleus-Coda Co-occurrences:\n"; foreach $vowel (@vowels) { if (exists $NucleusFreq{$vowel}) { printf COOCCUR "\t". replace_digraphs($vowel); printf EXPECTED "\t". replace_digraphs($vowel); } } printf COOCCUR "\n"; printf EXPECTED "\n"; foreach $consonant (@consonants) { $line = undef; $expected_line = undef; # We'll print values for each predefined consonant that actually occurs in the corpus if (exists $OnsetFreq{$consonant} or exists $CodaFreq{$consonant}) { printf COOCCUR replace_digraphs($consonant); printf EXPECTED replace_digraphs($consonant); foreach $vowel (@vowels) { if (exists $NucleusFreq{$vowel}) { $line .= "\t$NucCoda{$vowel}{$consonant}"; # The expected count is proportional to the joint probability of this nucleus and coda $expected_line .= "\t" . ($NucleusProb{$vowel} * $CodaProb{$consonant} * $total_nuclei); } } # For undefined (unseen) values, provide 0's $line =~ s/\t(?=\t)/\t0/g; $line =~ s/\t$/\t0/; $expected_line =~ s/\t(?=\t)/\t0/g; $expected_line =~ s/\t$/\t0/; printf COOCCUR "$line\n"; printf EXPECTED "$expected_line\n"; $seen_consonants{$consonant} = 1; } } # We might still have more codas that were found in the corpus, but not in the handy ordered list # provided above. (For example, if we also count complex codas) if ($print_all_consonants) { foreach $consonant (sort keys %OnsetFreq) { $line = undef; $expected_line = undef; unless ($seen_consonants{$consonant}) { printf COOCCUR replace_digraphs($consonant); printf EXPECTED replace_digraphs($consonant); foreach $vowel (@vowels) { if (exists $NucleusFreq{$vowel}) { $line .= "\t$NucCoda{$vowel}{$consonant}"; # The expected count is proportional to the joint probability of this nucleus and coda $expected_line .= "\t" . ($NucleusProb{$vowel} * $CodaProb{$consonant} * $total_nuclei); } } $line =~ s/\t(?=\t)/\t0/g; $line =~ s/\t$/\t0/; $expected_line =~ s/\t(?=\t)/\t0/g; $expected_line =~ s/\t$/\t0/; printf COOCCUR "$line\n"; printf EXPECTED "$expected_line\n"; } } } %seen_consonants = undef; $line = undef; $expected_line = undef; printf COOCCUR "\nObserved Onset-Coda Co-occurrences:\n"; printf EXPECTED "\nExpected Onset-Coda Co-occurrences:\n"; foreach $consonant (@consonants) { if (exists $OnsetFreq{$consonant} or exists $CodaFreq{$consonant}) { printf COOCCUR "\t". replace_digraphs($consonant); printf EXPECTED "\t". replace_digraphs($consonant); } } printf COOCCUR "\n"; printf EXPECTED "\n"; foreach $consonant (@consonants) { $line = undef; $expected_line = undef; # We'll print values for each predefined consonant that actually occurs in the corpus if (exists $OnsetFreq{$consonant} or exists $CodaFreq{$consonant}) { printf COOCCUR replace_digraphs($consonant); printf EXPECTED replace_digraphs($consonant); foreach $consonant2 (@consonants) { if (exists $OnsetFreq{$consonant} or exists $CodaFreq{$consonant}) { $line .= "\t$OnsetCoda{$consonant}{$consonant2}"; # The expected count is proportional to the joint probability of this onset and nucleus $expected_line .= "\t" . ($OnsetProb{$consonant} * $CodaProb{$consonant2} * $total_nuclei); } } # For undefined (unseen) values, provide 0's $line =~ s/\t(?=\t)/\t0/g; $line =~ s/\t$/\t0/; $expected_line =~ s/\t(?=\t)/\t0/g; $expected_line =~ s/\t$/\t0/; printf COOCCUR "$line\n"; printf EXPECTED "$expected_line\n"; } } $line = undef; $expected_line = undef; close (COOCCUR); sub remove_digraphs { $string = @_[0]; # Some digraphs indicate length redundantly on tense vowels; removing # the colon won't result in any neutralizations $string =~ s/\@r/и/g; $string =~ s/i:/i/g; $string =~ s/A:/A/g; $string =~ s/u:/u/g; $string =~ s/3:/3/g; $string =~ s/A~:/╠/g; $string =~ s/O~:/п/g; # Some tense vowels have lax correspondents with the same symbol; # have to change $string =~ s/O:/┐/g; # Nasalized short and long ╛ doesn't even seem like a real distinction; # I'm going to neutralize them $string =~ s/&~(:)/о/g; $string =~ s/eI/О/g; $string =~ s/aI/Y/g; $string =~ s/OI/Ъ/g; $string =~ s/aU/W/g; $string =~ s/\@U/o/g; # The following usually correspond to r in American English $string =~ s/I\@/Х/g; $string =~ s/E\@/С/g; $string =~ s/U\@/Я/g; # Also some consonant digraphs $string =~ s/dZ/J/g; $string =~ s/tS/C/g; $string =~ s/n,/B/g; # totally arbitrary; N is taken $string =~ s/m,/M/g; $string =~ s/l,/L/g; $string =~ s/r\*/R/g; return $string; } sub replace_digraphs { $string = @_[0]; # Some digraphs indicate length redundantly on tense vowels; removing # the colon won't result in any neutralizations $string =~ s/i/i:/g; $string =~ s/A/A:/g; $string =~ s/u/u:/g; $string =~ s/3/3:/g; $string =~ s/╠/A~:/g; $string =~ s/п/O~:/g; # Some tense vowels have lax correspondents with the same symbol; # have to change $string =~ s/┐/O:/g; # Nasalized short and long ╛ doesn't even seem like a real distinction; # I'm going to neutralize them $string =~ s/о/&~/g; $string =~ s/О/eI/g; $string =~ s/Y/aI/g; $string =~ s/Ъ/OI/g; $string =~ s/W/aU/g; $string =~ s/o/\@U/g; $string =~ s/Х/I\@/g; $string =~ s/С/E\@/g; $string =~ s/Я/U\@/g; # Also some consonant digraphs $string =~ s/J/dZ/g; $string =~ s/C/tS/g; $string =~ s/B/n,/g; # totally arbitrary; N is taken $string =~ s/M/m,/g; $string =~ s/L/l,/g; $string =~ s/R/r\*/g; $string =~ s/ /[none]/g; return $string; }