# This program computes the phonotactic probability metrics employed by
# Jusczyk et al in their study of infants' knowledge of phonotactics
# It uses the wordlist CelexWordsInTranscription.txt to calculate 
# probabilities from the English lexicon, and then applies these probabilities
# to test items, to give predicted well-formedness scores.

$count_cvc_only = 0;
$freq_threshhold = 1;
$print_all_consonants = 0;

$vowels = "aeiouAEIOU&V\@3Ì¯¿®ŽYšWo•‘ŸBML¨";

print "\nCounting frequencies in wordlist\n";
open (WORDLIST, "RandomHouseMonosyllables.txt") or die "Warning! Can't open wordlist: $!\n";
# The first line is the headers; skip it
<WORDLIST>;

while ($line = <WORDLIST>) {
    chomp($line);    
    ($badfreq, $freq, $orthography, $transcription, $template) = split("\t", $line);
    next if ($freq < $freq_threshhold);    
    
    $original_transcription = $transcription;        
    $transcription =~ s/^\[//;    
    $transcription =~ s/\]$//;    
    # It will help a lot to have every phoneme represented by a single character
    $transcription = remove_digraphs($transcription);    
            
    # Kessler & Treiman count only CVC, but this is something we might want to explore
    if ($count_cvc_only and $template ne "CVC") {
	next;		
    }
    
    # First, adjust positional probabilities
    if ($transcription =~ /([^$vowels]*)([$vowels])([^$vowels]*)/) {
	$onset = $1;	    
	$nucleus = $2;	    
	$coda = $3;	    	    
    } else {
	print "Error: can't parse syllable [$syl] into onset, nucleus, and coda in the word $original_transcription\n";
    }
    # Kessler & Treiman ignore empty positions, but this too is something that could make a difference
    if ($onset eq "") {
	$onset = " ";
    }
    if ($coda eq "") {
	$coda = " ";
    }
    
    # First the generic, position-independent counts
    $TotalFreq{$onset}++;    
    $TotalFreq{$nucleus}++;    
    $TotalFreq{$coda}++;    
    # Bookkeeping: handy to have a list of all consonants
    $Consonants{$onset} = 1;    
    $Consonants{$coda} = 1;        

    # Then, the contextual (position-dependent) counts
    $OnsetFreq{$onset}++;    
    $NucleusFreq{$nucleus}++;    
    $CodaFreq{$coda}++;
    $total_onsets++;    
    $total_nuclei++;    
    $total_codas++;
    
    # Now, count co-occurrences
    $OnsetNuc{$onset}{$nucleus}++;    
    $NucCoda{$nucleus}{$coda}++;    
    $OnsetCoda{$onset}{$coda}++;    
}
close (WORDLIST);


# Now that we're done counting from the corpus, we can calculate the probabilities
print "\nCalculating positional probabilities\n";
# First, the positional probabilities of each phoneme
foreach $onset (keys %OnsetFreq) {
    $OnsetProb{$onset} = $OnsetFreq{$onset} / $total_onsets;    
}
foreach $nucleus (keys %NucleusFreq) {
    $NucleusProb{$nucleus} = $NucleusFreq{$nucleus} / $total_nuclei;    
}
foreach $coda (keys %CodaFreq) {
    $CodaProb{$coda} = $CodaFreq{$coda} / $total_codas;    
}

# Let's save the results in a file
open (POSPROBFILE, ">RHPositionalProbabilities.txt") or die "Warning! Can't create positional probabilties file: $!\n";

# Now print the results, in decreasing probability:
printf POSPROBFILE "*****************************\n   Onset Probabilities\n*****************************\n";
foreach $onset (sort {$OnsetProb{$b} <=> $OnsetProb{$a}} keys %OnsetProb) {
    printf POSPROBFILE replace_digraphs($onset) . "\t$OnsetProb{$onset}\n";
}
printf POSPROBFILE "\n*****************************\n   Nucleus Probabilities\n*****************************\n";
foreach $nucleus (sort {$NucleusProb{$b} <=> $NucleusProb{$a}} keys %NucleusProb) {
    printf POSPROBFILE replace_digraphs($nucleus) . "\t$NucleusProb{$nucleus}\n";
}
printf POSPROBFILE "\n*****************************\n   Coda Probabilities\n*****************************\n";
foreach $coda (sort {$CodaProb{$b} <=> $CodaProb{$a}} keys %CodaProb) {
    printf POSPROBFILE replace_digraphs($coda) . "\t$CodaProb{$coda}\n";
}
close (POSPROBFILE);


# Now on to Study 1: print out the onset/coda distributions, for Study 1 chi square tests
print "\nCalculating onset/coda asymmetries\n";
open (ONSETCODA, ">RH-OnsetCodaDistributions.txt") or die "Warning! Can't create file for onset/coda distributions: $!\n";

@vowels = ("i", "I", "Ž", "E", "a", "&", "u", "U", "o", "¿", "V", "A", "Y", "W", "š", "¨",
	   "3", "Ì", "¯", "¿", "®", "•", "‘", "Ÿ", "B", "M", "L");                
@consonants = ("p", "t", "k", "b", "d", "g", "f", "T", "s", "S", "h", "C", "v", "D", "z", "Z", "J", "m", "n", "N", "l", "r", "w", "j");

printf ONSETCODA "Consonant\tTotal\tOnset (obs)\tCoda (obs)\tOnset (exp)\tCoda (exp)\tOnset Bias\tchi^2\n";
foreach $consonant (@consonants) {
    $seen_consonants{$consonant} = 1;        
    $expected = $TotalFreq{$consonant} / 2;
    $onset_bias = $OnsetFreq{$consonant} / $TotalFreq{$consonant};        
    $chi_square = 2 * (($OnsetFreq{$consonant} - $expected)**2)/$expected;			
    $line = "$consonant\t$TotalFreq{$consonant}\t$OnsetFreq{$consonant}\t$CodaFreq{$consonant}\t$expected\t$expected\t$onset_bias\t$chi_square\n";    
    $line =~ s/\t\t/\t0\t/g;        
    printf ONSETCODA $line;        
}
# There will also be other consonants in the list (esp clusters)
# So, get those from the keys of the %consonants hash
foreach $consonant (sort keys %Consonants) {
    unless ($seen_consonants{$consonant}) {
	$expected = $TotalFreq{$consonant} / 2;        
	$onset_bias = $OnsetFreq{$consonant} / $TotalFreq{$consonant};        
	$chi_square = 2 * (($OnsetFreq{$consonant} - $expected)**2)/$expected;			
	$line = "$consonant\t$TotalFreq{$consonant}\t$OnsetFreq{$consonant}\t$CodaFreq{$consonant}\t$expected\t$expected\t$onset_bias\t$chi_square\n";    
	$line =~ s/\t\t/\t0\t/g;        
	printf ONSETCODA $line;        
    }
}

%seen_consonants = undef;

# Now, calculate co-occurrence counts
print "\nTallying co-occurrences\n";

open (COOCCUR, ">RH-Cooccurrences.txt") or die "Warning! Can't create file of cooccurrence counts: $!\n";
open (EXPECTED, ">RH-ExpectedCooccurrences.txt") or die "Warning! Can't create file of expected cooccurrence counts: $!\n";
printf COOCCUR "Observed Onset-Nucleus Co-occurrences:\n";
printf EXPECTED "Expected Onset-Nucleus Co-occurrences:\n";

foreach $vowel (@vowels) {
  if (exists $NucleusFreq{$vowel}) {
      printf COOCCUR "\t". replace_digraphs($vowel);            
      printf EXPECTED "\t". replace_digraphs($vowel);            
  }  
}
printf COOCCUR "\n";
printf EXPECTED "\n";
foreach $consonant (@consonants) {
    $line = undef;        
    $expected_line = undef;    
    
    # We'll print values for each predefined consonant that actually occurs in the corpus
    if (exists $OnsetFreq{$consonant} or exists $CodaFreq{$consonant}) {
	printf COOCCUR replace_digraphs($consonant);	
	printf EXPECTED replace_digraphs($consonant);	
	foreach $vowel (@vowels) {
	    if (exists $NucleusFreq{$vowel}) {
		$line .= "\t$OnsetNuc{$consonant}{$vowel}";            
		# The expected count is proportional to the joint probability of this onset and nucleus
		$expected_line .= "\t" . ($OnsetProb{$consonant} * $NucleusProb{$vowel} * $total_nuclei);				
	    }  	    
	}
	# For undefined (unseen) values, provide 0's
	$line =~ s/\t(?=\t)/\t0/g;		
	$line =~ s/\t$/\t0/;
	$expected_line =~ s/\t(?=\t)/\t0/g;		
	$expected_line =~ s/\t$/\t0/;
	printf COOCCUR "$line\n";	
	printf EXPECTED "$expected_line\n";		
	$seen_consonants{$consonant} = 1;		
    }
}
# We might still have more onsets that were found in the corpus, but not in the handy ordered list
# provided above.  (For example, if we also count complex onsets)
# So, now go back and look at ALL attested onsets, skipping the ones we've already handled.
# (This looks overly complex, but the goal is to improve readability and make it easier to analyze
#  the results, but guaranteeing that ordinary stuff is up front in a logical order)
if ($print_all_consonants) {
    foreach $consonant (sort keys %OnsetFreq) {
	$line = undef;        
	$expected_line = undef;		
	unless ($seen_consonants{$consonant}) {
	    printf COOCCUR replace_digraphs($consonant);	
	    printf EXPECTED replace_digraphs($consonant);	
	    foreach $vowel (@vowels) {
		if (exists $NucleusFreq{$vowel}) {
		    $line .= "\t$OnsetNuc{$consonant}{$vowel}";            
		    # The expected count is proportional to the joint probability of this onset and nucleus
		    $expected_line .= "\t" . ($OnsetProb{$consonant} * $NucleusProb{$vowel} * $total_nuclei);				
		}  	    
	    }
	    $line =~ s/\t(?=\t)/\t0/g;		
	    $line =~ s/\t$/\t0/;		
	    $expected_line =~ s/\t(?=\t)/\t0/g;		
	    $expected_line =~ s/\t$/\t0/;
	    printf COOCCUR "$line\n";	
	    printf EXPECTED "$expected_line\n";		
	}
    }
}


%seen_consonants = undef;
$line = undef;
$expected_line = undef;

printf COOCCUR "\nObserved Nucleus-Coda Co-occurrences:\n";
printf EXPECTED "\nExpected Nucleus-Coda Co-occurrences:\n";
foreach $vowel (@vowels) {
  if (exists $NucleusFreq{$vowel}) {
      printf COOCCUR "\t". replace_digraphs($vowel);            
      printf EXPECTED "\t". replace_digraphs($vowel);            
  }  
}
printf COOCCUR "\n";
printf EXPECTED "\n";
foreach $consonant (@consonants) {
    $line = undef;     
    $expected_line = undef;        
    # We'll print values for each predefined consonant that actually occurs in the corpus
    if (exists $OnsetFreq{$consonant} or exists $CodaFreq{$consonant}) {
	printf COOCCUR replace_digraphs($consonant);	
	printf EXPECTED replace_digraphs($consonant);	
	foreach $vowel (@vowels) {
	    if (exists $NucleusFreq{$vowel}) {
		$line .= "\t$NucCoda{$vowel}{$consonant}";            
		# The expected count is proportional to the joint probability of this nucleus and coda
		$expected_line .= "\t" . ($NucleusProb{$vowel} * $CodaProb{$consonant} * $total_nuclei);				
	    }  	    
	}
	# For undefined (unseen) values, provide 0's
	$line =~ s/\t(?=\t)/\t0/g;		
	$line =~ s/\t$/\t0/;		
	$expected_line =~ s/\t(?=\t)/\t0/g;		
	$expected_line =~ s/\t$/\t0/;
	printf COOCCUR "$line\n";	
	printf EXPECTED "$expected_line\n";		
	$seen_consonants{$consonant} = 1;		
    }
}
# We might still have more codas that were found in the corpus, but not in the handy ordered list
# provided above.  (For example, if we also count complex codas)
if ($print_all_consonants) {
    foreach $consonant (sort keys %OnsetFreq) {
	$line = undef; 
	$expected_line = undef;		
	unless ($seen_consonants{$consonant}) {
	    printf COOCCUR replace_digraphs($consonant);	
	    printf EXPECTED replace_digraphs($consonant);	
	    foreach $vowel (@vowels) {
		if (exists $NucleusFreq{$vowel}) {
		    $line .= "\t$NucCoda{$vowel}{$consonant}";            
		    # The expected count is proportional to the joint probability of this nucleus and coda
		    $expected_line .= "\t" . ($NucleusProb{$vowel} * $CodaProb{$consonant} * $total_nuclei);				
		}  	    
	    }
	    $line =~ s/\t(?=\t)/\t0/g;		
	    $line =~ s/\t$/\t0/;		
	    $expected_line =~ s/\t(?=\t)/\t0/g;		
	    $expected_line =~ s/\t$/\t0/;
	    printf COOCCUR "$line\n";	
	    printf EXPECTED "$expected_line\n";		
	}
    }
}

%seen_consonants = undef;
$line = undef;
$expected_line = undef;

printf COOCCUR "\nObserved Onset-Coda Co-occurrences:\n";
printf EXPECTED "\nExpected Onset-Coda Co-occurrences:\n";
foreach $consonant (@consonants) {
  if (exists $OnsetFreq{$consonant} or exists $CodaFreq{$consonant}) {
      printf COOCCUR "\t". replace_digraphs($consonant);            
      printf EXPECTED "\t". replace_digraphs($consonant);            
  }  
}
printf COOCCUR "\n";
printf EXPECTED "\n";
foreach $consonant (@consonants) {
    $line = undef;        
    $expected_line = undef;        
    # We'll print values for each predefined consonant that actually occurs in the corpus
    if (exists $OnsetFreq{$consonant} or exists $CodaFreq{$consonant}) {
	printf COOCCUR replace_digraphs($consonant);	
	printf EXPECTED replace_digraphs($consonant);	
	foreach $consonant2 (@consonants) {
	    if (exists $OnsetFreq{$consonant} or exists $CodaFreq{$consonant}) {
		$line .= "\t$OnsetCoda{$consonant}{$consonant2}";            
		# The expected count is proportional to the joint probability of this onset and nucleus
		$expected_line .= "\t" . ($OnsetProb{$consonant} * $CodaProb{$consonant2} * $total_nuclei);				
	    }  	    
	}
	# For undefined (unseen) values, provide 0's
	$line =~ s/\t(?=\t)/\t0/g;		
	$line =~ s/\t$/\t0/;		
	$expected_line =~ s/\t(?=\t)/\t0/g;		
	$expected_line =~ s/\t$/\t0/;
	printf COOCCUR "$line\n";	
	printf EXPECTED "$expected_line\n";		
    }
}

$line = undef;
$expected_line = undef;


close (COOCCUR);


sub remove_digraphs {
    $string = @_[0];
    # Some digraphs indicate length redundantly on tense vowels; removing
    # the colon won't result in any neutralizations
    $string =~ s/\@r/¨/g;        
    $string =~ s/i:/i/g;    
    $string =~ s/A:/A/g;    
    $string =~ s/u:/u/g;    
    $string =~ s/3:/3/g;        
    $string =~ s/A~:/Ì/g;        
    $string =~ s/O~:/¯/g;        

    # Some tense vowels have lax correspondents with the same symbol;
    # have to change
    $string =~ s/O:/¿/g;
  
    # Nasalized short and long ¾ doesn't even seem like a real distinction;
    # I'm going to neutralize them
    $string =~ s/&~(:)/®/g;
    $string =~ s/eI/Ž/g;
    $string =~ s/aI/Y/g;    
    $string =~ s/OI/š/g;        
    $string =~ s/aU/W/g;
    $string =~ s/\@U/o/g; 
    # The following usually correspond to r in American English
    $string =~ s/I\@/•/g;    
    $string =~ s/E\@/‘/g;        
    $string =~ s/U\@/Ÿ/g;    
    
    # Also some consonant digraphs
    $string =~ s/dZ/J/g;    
    $string =~ s/tS/C/g;    
    $string =~ s/n,/B/g; # totally arbitrary; N is taken
    $string =~ s/m,/M/g;      
    $string =~ s/l,/L/g;    
    $string =~ s/r\*/R/g; 
    
    return $string;    
}
sub replace_digraphs {
    $string = @_[0];
    # Some digraphs indicate length redundantly on tense vowels; removing
    # the colon won't result in any neutralizations
    $string =~ s/i/i:/g;    
    $string =~ s/A/A:/g;    
    $string =~ s/u/u:/g;    
    $string =~ s/3/3:/g;        
    $string =~ s/Ì/A~:/g;        
    $string =~ s/¯/O~:/g;        

    # Some tense vowels have lax correspondents with the same symbol;
    # have to change
    $string =~ s/¿/O:/g;
  
    # Nasalized short and long ¾ doesn't even seem like a real distinction;
    # I'm going to neutralize them
    $string =~ s/®/&~/g;
    $string =~ s/Ž/eI/g;
    $string =~ s/Y/aI/g;    
    $string =~ s/š/OI/g;        
    $string =~ s/W/aU/g;
    $string =~ s/o/\@U/g; 
    
    $string =~ s/•/I\@/g;    
    $string =~ s/‘/E\@/g;        
    $string =~ s/Ÿ/U\@/g;    

    # Also some consonant digraphs
    $string =~ s/J/dZ/g;    
    $string =~ s/C/tS/g;    
    $string =~ s/B/n,/g; # totally arbitrary; N is taken
    $string =~ s/M/m,/g;      
    $string =~ s/L/l,/g;    
    $string =~ s/R/r\*/g;
    $string =~ s/ /[none]/g;    
    

    return $string;    
}
