Skip to content

Commit

Permalink
Initial revision
Browse files Browse the repository at this point in the history
  • Loading branch information
denizyuret committed Nov 29, 2006
1 parent 28667d1 commit a825121
Show file tree
Hide file tree
Showing 6 changed files with 191 additions and 0 deletions.
24 changes: 24 additions & 0 deletions ambiguous.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/perl -w
# Determine ambiguous turkish words when converted to ascii
# Input assumed one word per line in Latin-5

use strict;
use locale;
use POSIX qw(locale_h);
# Switch to the Turkish Latin-5 locale so that, under "use locale", \w and
# lc() handle the non-ASCII Turkish letters.  setlocale() returns undef on
# failure; the script can still run then, but the grouping below may be
# wrong, so say so instead of failing silently.
setlocale(LC_ALL, "tr_TR.iso88599")
    or warn "$0: cannot set locale tr_TR.iso88599; lowercasing of Turkish letters may be wrong\n";

# %word maps each asciified lowercase word to the set of distinct lowercase
# spellings that collapse onto it: $word{$ascii}{$lcword} = occurrence count.
my %word;
while (my $line = <>) {
    chomp $line;
    next unless $line =~ /\w/;    # skip lines with no word character at all
    my $lcword = lc($line);
    my $ascii  = $lcword;
    # Map the six accented/special lowercase letters to their ASCII
    # look-alikes.  NOTE(review): the left-hand list is raw Latin-5 bytes
    # (it may render as Latin-1 glyphs in some viewers); keep this file in
    # the same 8-bit encoding as the input.
    $ascii =~ tr/çðýöþü/cgiosu/;
    $word{$ascii}{$lcword}++;
}

# Report every ASCII form that is shared by more than one distinct spelling,
# one group per line: the ASCII form followed by all of its spellings.
# Keys are sorted so that the output is reproducible from run to run.
for my $w (sort keys %word) {
    my @a = sort keys %{ $word{$w} };
    next if @a == 1;    # only one spelling: not ambiguous
    # Keep "\n" outside the join so the line does not end in a stray space.
    print join(' ', $w, @a), "\n";
}
23 changes: 23 additions & 0 deletions dlist2alist.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/perl -w
# Convert a dlist model into an alist
use strict;

# Read the whole decision list: one rule per line, "CLASS PATTERN",
# optionally followed by whitespace and a "#" comment.
my @model = <>;
die "$0: empty model\n" unless @model;

# Append the placeholder pattern "X" after the first token of the first
# line.  Presumably that line is the default rule and carries only a class;
# any line that ends up without exactly one pattern is rejected below.
$model[0] =~ s/^(\S+)/$1 X/;

# Emit the rules as one Lisp alist, last rule first.
print "(";
for my $rule (reverse @model) {
    (my $text = $rule) =~ s/\s+\#.*//;    # drop trailing comment
    my ($class, @pat) = split(' ', $text);
    die "$0: expected 'CLASS PATTERN', got: $rule" if scalar(@pat) != 1;
    my $pat = $pat[0];
    # Underscores stand for spaces in the pattern.  (An earlier variant,
    # s/[\W_]/ /g, blanked every non-word character; only "_" is mapped now.)
    $pat =~ s/_/ /g;
    # Escape for a Lisp string literal: backslash as well as double quote,
    # otherwise a pattern containing "\" would be misread by the Lisp reader.
    $pat =~ s/(["\\])/\\$1/g;
    if ($class == 0) {
        print "(\"$pat\") ";          # class 0: bare entry
    }
    else {
        print "(\"$pat\" . t) ";      # any other class: entry paired with t
    }
}
print ")\n";
55 changes: 55 additions & 0 deletions features.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <glib.h>
FILE *popen(const char *command, const char *type);
int pclose(FILE *stream);
#include "foreach.h"

GStringChunk *strings = NULL;
GHashTable *strtable = NULL;

/*
 * Feature extractor.
 *
 * Optional argv[1]: a gzip-compressed feature list (one feature per line).
 * When given, only substrings present in that list are printed; otherwise
 * every candidate substring is printed.
 *
 * For each instance line read via foreach_line(buf, NULL) the output is the
 * first character of the line (the class) followed by space-separated
 * substrings buf[a..b-1] for a in 2..12 and b from 13 upward.
 *
 * NOTE(review): the fixed indices assume every instance line is long enough
 * to be indexed up to the upper bound used with foreach_int -- confirm
 * against the producer of this input and against foreach.h.
 */
int main(int argc, char **argv) {
    unsigned lines = 0;
    if (argc > 1) {             /* we have a feature list */
        fputs("Reading features\n", stderr);
        strings = g_string_chunk_new(1024);
        strtable = g_hash_table_new(g_str_hash, g_str_equal);
        char cmd[256];
        int written = snprintf(cmd, sizeof cmd, "|zcat %s", argv[1]);
        if (written < 0 || (size_t) written >= sizeof cmd) {
            /* A truncated command would silently read the wrong file. */
            fprintf(stderr, "%s: feature file name too long: %s\n",
                    argv[0], argv[1]);
            return EXIT_FAILURE;
        }
        foreach_line(buf, cmd) {
            if (++lines % 100000 == 0) fprintf(stderr, ".");
            /* Strip the line terminator, but only if there is one: an empty
               buffer or a final line without '\n' must not lose a byte. */
            size_t buflen = strlen(buf);
            if (buflen > 0 && buf[buflen - 1] == '\n')
                buf[buflen - 1] = 0;
            gchar *s = g_string_chunk_insert(strings, buf);
            g_hash_table_insert(strtable, s, s);
        }
    }
    fputs("\nReading instances\n", stderr);
    lines = 0;
    foreach_line(buf, NULL) {
        if (++lines % 100000 == 0) fprintf(stderr, ".");
        buf[12] = 'X';          /* mask the character at the centre position */
        foreach_int(i, 13, 22) {        /* lowercase right side */
            /* ctype functions require an unsigned char value (or EOF). */
            int c = (unsigned char) buf[i];
            if (isupper(c))
                buf[i] = tolower(c);
        }
        putchar(*buf);          /* class */
        foreach_int(a, 2, 12) {
            foreach_int(b, 13, 23) {
                /* include a, exclude b: temporarily terminate at b */
                int c = buf[b];
                buf[b] = 0;
                gchar *s = buf + a;
                if ((NULL == strtable) ||
                    g_hash_table_lookup(strtable, s)) {
                    putchar(' ');
                    fputs(s, stdout);
                }
                buf[b] = c;
                /* stop growing to the right once the substring ends in '_' */
                if (buf[b-1] == '_') break;
            }
        }
        putchar('\n');
    }
    return 0;
}
63 changes: 63 additions & 0 deletions repeated.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <glib.h>
#include "foreach.h"

static unsigned *ht;
GStringChunk *strings;
GHashTable *strtable;

/*
 * Test-and-set bit number h in the global bitmap ht.
 * Returns 1 if the bit was already set, otherwise sets it and returns 0.
 * The low 5 bits of h select the bit, the remaining bits select the word.
 */
int seen(unsigned h) {
    /* 1u, not 1: shifting a signed 1 into bit 31 is undefined behaviour. */
    unsigned bit = 1u << (h & 31);
    unsigned word = h >> 5;
    if (ht[word] & bit)
        return 1;
    ht[word] |= bit;
    return 0;
}

/*
 * Print str followed by a newline the first time it is passed in; later
 * calls with an equal string print nothing.  A private copy of each printed
 * string is kept in the global chunk and registered in the global table.
 */
void output(const char *str) {
    gchar *copy;

    if (g_hash_table_lookup(strtable, str))
        return;                 /* already printed */

    puts(str);
    copy = g_string_chunk_insert(strings, str);
    g_hash_table_insert(strtable, copy, copy);
}

/*
 * Print each candidate substring whose hash has been encountered before.
 *
 * Every line read via foreach_line(buf, NULL) is normalised (position 12
 * replaced by 'X', positions 13..22 lowercased).  For each substring
 * buf[a..b-1] a hash is accumulated as a sum of byte * r[position].
 * seen() records the hash in a bitmap; when the hash was already recorded,
 * the substring is handed to output(), which prints it once.  Distinct
 * substrings with colliding hashes can therefore be reported as repeated.
 *
 * NOTE(review): the fixed indices assume every input line is long enough to
 * be indexed up to the foreach_int upper bound -- confirm against foreach.h.
 */
int main() {
    /* 1<<27 words; with 32-bit unsigned that is one bit per possible hash. */
    ht = calloc(1<<27, sizeof(unsigned));
    if (ht == NULL) {
        /* seen() would dereference NULL on the first call otherwise. */
        fputs("repeated: cannot allocate hash bitmap\n", stderr);
        return EXIT_FAILURE;
    }
    strings = g_string_chunk_new(1024);
    strtable = g_hash_table_new(g_str_hash, g_str_equal);
    /* Per-position multipliers; rand() is never seeded here, so they are
       the same on every run. */
    unsigned r[24];
    foreach_int(i, 0, 23) {
        r[i] = (unsigned) rand();
    }
    unsigned lines = 0;
    foreach_line(buf, NULL) {
        if (++lines % 100000 == 0) fprintf(stderr, ".");
        buf[12] = 'X';
        foreach_int(i, 13, 22) {
            /* ctype functions require an unsigned char value (or EOF). */
            int c = (unsigned char) buf[i];
            if (isupper(c))
                buf[i] = tolower(c);
        }
        foreach_int(a, 2, 12) {
            /* hash of buf[a..12], extended one byte at a time below */
            unsigned h = 0;
            foreach_int(i, a, 12)
                h += (unsigned) buf[i] * r[i];
            foreach_int(b, 13, 23) {
                /* include a, exclude b */
                if (seen(h)) {
                    int c = buf[b];
                    buf[b] = 0;         /* temporarily terminate at b */
                    output(buf + a);
                    buf[b] = c;
                }
                /* stop growing to the right once the substring ends in '_' */
                if (buf[b-1] == '_') break;
                h += (unsigned) buf[b] * r[b];
            }
        }
    }
    return 0;
}
12 changes: 12 additions & 0 deletions test.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/perl -w
use strict;
# Usage: test.pl LETTER.out
#
# Feeds test.100k through instances.pl, features and "dlist -t FILE", reads
# the first line x of the pipeline's output and prints "FILE<TAB>1/(1-x)".
my $file = shift;
defined $file or die "usage: $0 LETTER.out\n";

# The name is interpolated into a shell command below, so match it strictly:
# \A...\z (unlike ^...$) does not tolerate a trailing newline.
$file =~ /\A(\w)\.out\z/
    or die "$0: bad model file name '$file' (expected LETTER.out)\n";
my $letter = $1;

my $pipeline = qq{zcat test.100k | instances.pl $letter | features | dlist -t $file};
warn "$pipeline |\n";

# Three-argument pipe open with a lexical handle; a single command string
# containing shell metacharacters is still run through the shell.
open(my $fp, '-|', $pipeline) or die "$0: cannot run pipeline: $!\n";
my $ans = <$fp>;
# Only the first line is wanted; the pipeline may be cut short by this
# close, so its exit status is deliberately not treated as an error.
close($fp);

defined $ans or die "$0: pipeline produced no output\n";
chomp $ans;
$ans != 1 or die "$0: pipeline reported 1, cannot compute 1/(1-x)\n";
$ans = 1/(1-$ans);
print "$file\t$ans\n";
14 changes: 14 additions & 0 deletions uniq.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#include <glib.h>
#include "foreach.h"

/*
 * Order-preserving uniq: write each distinct input line to stdout the first
 * time it occurs and drop later repeats.  Lines come from
 * foreach_line(buf, NULL) (presumably standard input -- see foreach.h) and
 * are written back unchanged with fputs.
 */
int main() {
    GHashTable *seen_lines = g_hash_table_new(g_str_hash, g_str_equal);
    GStringChunk *storage = g_string_chunk_new(1024);
    foreach_line(buf, NULL) {
        gboolean is_new = (g_hash_table_lookup(seen_lines, buf) == NULL);
        if (is_new) {
            /* keep a private copy: buf is reused for the next line */
            gchar *kept = g_string_chunk_insert(storage, buf);
            g_hash_table_insert(seen_lines, kept, kept);
            fputs(kept, stdout);
        }
    }
}

0 comments on commit a825121

Please sign in to comment.