Maximal matching of patterns

Assume you have a list of patterns, say,

ab, abcd, acde, cdac, cdef, dec, def, defd, efg

and a piece of string, say,

abcdacdecdefdefg

How can you chop the string into pieces according to the patterns known? I wrote the following maximal matching code for this purpose. I call those patterns the tokens and the string is a concatenation of tokens. The problem is now figure out what are the tokens to constitute the string.

#!/usr/bin/perl
# Copyright 2007 (c) Adrian Sai-wah Tam
# Wed Mar 28 10:26:05 HKT 2007
#
# Maximal matching for segmentation
# It tries with forward max matching first, then go with backward if patterns
# not found.
#
# Version 2.0
# Updated the code to make it modularized and easier to read

use strict;
use warnings;

my $TRUE=1;
my $FALSE=0;

my %dict;		# Dictionary of matching tokens (fmt: token=>1)
my $maxtoklen=0;	# Max token length
my $sep='/';		# Separator for printing the tokenized

# Open and read-in the dictionary
open(DICT,"dict.txt") || die "Cannot open dictionary\n";
while(<DICT>) {
	chomp;
	$dict{$_} = 1;
	$maxtoklen = length($_) unless ($maxtoklen > length($_));
};
close(DICT);

# Read the string from console and tokenize it
while(<>){
	chomp;
	my @tokens = tokenize($_);
	print join($sep,@tokens) . "\n";
};

##########
# Function pool
# 

## Tokenize the supplied string
sub tokenize {
	my @tokens;	# Array of tokens
	my $rest=$_[0];	# The string not yet tokenized
	while (length($rest)) {
		my @result = matchOne($rest);	# Get the first token
		# Check if it is matched successfully
		if (length($result[0])) {
			push @tokens, $result[0];
			$rest = $result[1];
			next;
		};
		# Now we see something wrong in maximal matching, rollback!
		my $brokenTok;
		rollback(\@tokens, \$rest, \$brokenTok) || die "Rollback failed.\n";
		# Then, use reverse maximal matching to rectify the tokens
		revmatch(\@tokens, $brokenTok) || die "Reverse match failed: New tokens?\n";
	};
	return @tokens;
};

## Find the leading token in the string by maximal matching
sub matchOne {
	my @result = ("", $_[0]);	# 1=token, 2=remaining

	# Remove leading spaces
	(my $sanitized = $result[1]) =~ s/^\s+//;
	# Extract substrings in various length and match with dictionary
	for(my $i=$maxtoklen; $i>=1; $i--) {
		my $substr = substr($sanitized,0,$i);
		# Update @result if something is found
		if ( defined($dict{$substr}) ) {
			$result[0] = $substr;
			$result[1] = substr($sanitized,length($substr));
			last;
		};
	};
	return @result;
};

## Rollback function: to search for the (possibily) correct boundary from
#                     the given array of tokens and the untokenized string
sub rollback {
	(my $refTok, my $refRest, my $refBrokenTok) = @_;
	my $rest = $$refRest;		# Copy of untokenized string
	my @tokens = @$refTok;		# Copy of tokens array
	my $brokenTok;			# Variable to hold the broken token
	my @tok = ("","");		# Array to hold the search result
	# Loop to ensure we exhausted $maxtoklen possibilities
	for (my $lenTried=0; $lenTried<$maxtoklen;) {
		if (scalar(@tokens)==0) {	# If no more things to rollback
			$tok[0]="";		#   clear the misleading indication
			last;			#   and stop
		};
		$brokenTok = pop @tokens;
		my $lastTok = $brokenTok;
		# Boundary-searching loop
		while (length($brokenTok)) {
			$rest = substr($brokenTok,-1) . $rest;
			$brokenTok = substr($brokenTok,0,-1);
			$lenTried++;
			@tok = matchOne($rest);	# Try if this is a boundary
			last if (length($tok[0]));	# Stop if found something
		};
		# Check if we need to continue the search
		last if ($lastTok ne $tok[0]);	# Check if we found something new
	};
	# Check the indication to see if we found anything, and report
	if (length($tok[0])) {
		$$refRest = $tok[0].$tok[1];	# Update the info
		$$refBrokenTok = $brokenTok;
		@$refTok = @tokens;
		return $TRUE;
	} else {
		return $FALSE;
	};
};

## Reverse maximal matching to rectify incorrect tokenization
sub revmatch {
	my $rest = $_[1]; 		# Copy of the broken token
	my @tokens = @{$_[0]};		# Copy of tokens array
	my @newToks;			# Array to store the new tokens
	my $brokenTok;			# Variable to hold the broken token in the process

	while (length($rest)) {
		# if $rest is already a token, we can stop here
		if ( defined($dict{$rest}) ) {
			unshift @newToks, $rest;
			last;
		};
		# Return if reverse match failed. (new token?)
		return $FALSE unless (rollback(\@tokens,\$rest,\$brokenTok));
		# Otherwise, try to get the new token
		my @result = matchOne($rest);
		# If the remaining if not zero, it is not reverse max matching
		return $FALSE if (length($result[1]));
		# Put the new token into array and continue matching
		unshift @newToks, $result[0];
		$rest = $brokenTok;
	};
	push @tokens, @newToks;
	@{$_[0]} = @tokens;
	return $TRUE;
};

The program is assuming no other stuff other than the tokens may constitute the string and there must be a solution. What it expects is a file named dict.txt with the tokens listed line-by-line.

Assuming the input string as a tape and the reading head is at the first character, the algorithm is as follows.

Maximal matching from left to right, check against the dictionary
If something is not matched, there must be a mistake behind
- Move backward for the nearest boundary that makes a valid token at the reading head
- Declare this as a fixed boundary
- Perform reverse maximal matching to correct the tokening, by checking against the dictionary
Repeat until the input tape is exhausted

This program may be useful for the segmentation of Chinese/Japanese text where no explicit phase (a.k.a. “word”) boundary exists. However, to make it useful for such purpose, there must be a way to figure out new phases.

Extension: New token identification

The above program can be modified to include new token identification. That is, instead of halting the program, we can identify a phase that does not exist in the dictionary and continue the match.

#!/usr/bin/perl
# Copyright 2007 (c) Adrian Sai-wah Tam
# Wed Mar 28 10:26:05 HKT 2007
#
# Maximal matching for segmentation
# It tries with forward max matching first, then go with backward if patterns
# not found.
#
# Test case:
#   with dict.txt contains: ab abcd acde cdef def cdac dec defd efg
#   invoking with
#     echo abcdacdecdefdefgab | perl maxmatch3.pl
#   shall give: "ab/cdac/dec/defd/efg/ab";
#   invoking with
#     echo abcdacdec22defd33efgab | perl maxmatch3.pl
#   shall give: "ab/cdac/dec/22/defd/33/efg/ab" with new token "22/33";
#
# Version 3.0
# Added the functionality of new token detection

use strict;
use warnings;

my $TRUE=1;
my $FALSE=0;

my %dict;		# Dictionary of matching tokens (fmt: token=>1)
my $maxtoklen=0;	# Max token length
my $sep='/';		# Separator for printing the tokenized
my @newToks;		# Array to hold the newly found tokens

# Open and read-in the dictionary
open(DICT,"dict.txt") || die "Cannot open dictionary\n";
while(<DICT>) {
	chomp;
	$dict{$_} = 1;
	$maxtoklen = length($_) unless ($maxtoklen > length($_));
};
close(DICT);

# Read the string from console and tokenize it
while(<>){
	chomp;
	my @tokens = tokenize($_);
	print join($sep,@tokens) . "\n";
	print "New tokens found: " . join($sep,@newToks) . "\n" if (scalar(@newToks));
};

##########
# Function pool
# 

## Tokenize the supplied string
sub tokenize {
	my @tokens;	# Array of tokens
	my $rest=$_[0];	# The string not yet tokenized
	my $lastIsRollback = $FALSE;
	while (length($rest)) {
		my @result = matchOne($rest);	# Get the first token
		# Check if it is matched successfully
		if (length($result[0])) {
			push @tokens, $result[0];
			$rest = $result[1];
			$lastIsRollback = $FALSE;
			next;
		};
		# Is it a new token or mistake in maximal matching?
		if ($lastIsRollback) {
			getNewTok(\@tokens, \$rest);
			next;
		};
		# Now we see something wrong in maximal matching, rollback!
		my $brokenTok;
		$lastIsRollback = $TRUE;
		unless ( rollback(\@tokens, \$rest, \$brokenTok) ) {
			# Rollback failed, must be some new tokens there
			getNewTok(\@tokens, \$rest);
			next;
		};
		# Then, use reverse maximal matching to rectify the tokens
		unless ( revmatch(\@tokens, $brokenTok) ) {
			# if reverse match faild, $brokenTok is possibly a new token
			push @tokens, $brokenTok;	# Append to the tokens array
			push @newToks, $brokenTok;
			$dict{$brokenTok} = 1;		# and add to dictionary
			$maxtoklen = length($brokenTok) unless ($maxtoklen > length($brokenTok));
		};
		# Now we know that the next one must be a matching token, this is
		# to prevent flip-flop-like looped rectification
		last unless (length($rest));
		@result = matchOne($rest);
		$rest = $result[1];
		push @tokens, $result[0];
	};
	return @tokens;
};

## Find the leading token in the string by maximal matching
sub matchOne {
	my @result = ("", $_[0]);	# 1=token, 2=remaining

	# Remove leading spaces
	(my $sanitized = $result[1]) =~ s/^\s+//;
	# Extract substrings in various length and match with dictionary
	for(my $i=$maxtoklen; $i>=1; $i--) {
		my $substr = substr($sanitized,0,$i);
		# Update @result if something is found
		if ( defined($dict{$substr}) ) {
			$result[0] = $substr;
			$result[1] = substr($sanitized,length($substr));
			last;
		};
	};
	return @result;
};

## Rollback function: to search for the (possibily) correct boundary from
#                     the given array of tokens and the untokenized string
sub rollback {
	(my $refTok, my $refRest, my $refBrokenTok) = @_;
	my $rest = $$refRest;		# Copy of untokenized string
	my @tokens = @$refTok;		# Copy of tokens array
	my $brokenTok;			# Variable to hold the broken token
	my $lastTok;
	my @tok = ("","");		# Array to hold the search result
	# Loop to ensure we exhausted $maxtoklen possibilities
	for (my $lenTried=0; $lenTried<$maxtoklen;) {
		$tok[0]="";			#   clear the misleading indication
		last if (scalar(@tokens)==0);	# If no more things to rollback
		$lastTok = pop @tokens;
		$brokenTok = $lastTok;
		# Boundary-searching loop
		while (length($brokenTok)) {
			$rest = substr($brokenTok,-1) . $rest;
			$brokenTok = substr($brokenTok,0,-1);
			$lenTried++;
			@tok = matchOne($rest);		# Try if this is a boundary
			last if (length($tok[0]));	# Stop if found something
		};
		# Check if we need to continue the search
		last if ($lastTok ne $tok[0]);	# Check if we found something new
	};
	# Check the indication to see if we found anything, and report
	return $FALSE if ($lastTok eq $tok[0] || length($tok[0])==0);
	$$refRest = $tok[0].$tok[1];	# Update the info
	$$refBrokenTok = $brokenTok;
	@$refTok = @tokens;
	return $TRUE;
};

## Reverse maximal matching to rectify incorrect tokenization
sub revmatch {
	my $rest = $_[1]; 		# Copy of the broken token
	my @tokens = @{$_[0]};		# Copy of tokens array
	my @newToks;			# Array to store the new tokens
	my $brokenTok;			# Variable to hold the broken token in the process

	while (length($rest)) {
		# if $rest is already a token, we can stop here
		if ( defined($dict{$rest}) ) {
			unshift @newToks, $rest;
			last;
		};
		# Return if reverse match failed. (new token?)
		return $FALSE unless (rollback(\@tokens,\$rest,\$brokenTok));
		# Otherwise, try to get the new token
		my @result = matchOne($rest);
		# If the remaining if not zero, it is not reverse max matching
		return $FALSE if (length($result[1]));
		# Put the new token into array and continue matching
		unshift @newToks, $result[0];
		$rest = $brokenTok;
	};
	push @tokens, @newToks;
	@{$_[0]} = @tokens;
	return $TRUE;
};

## Get new token: Given the string is headed by a new token, extract it
#          and append to the token array
sub getNewTok {
	my $rest = ${$_[1]};	# Copy of the string
	my $token = "";		# Array to store the new token
	my @result = matchOne($rest);
	# The matching loop
	while (length($result[0])==0) {
		$token = substr($rest,0,1).$token;
		$rest = substr($rest,1);
		last unless (length($rest));
		@result = matchOne($rest);
	};
	# We are here either because we found the next dictionary word or
	# we exhaused the matching string, in either case, the new word
	# is in $token
	push @{$_[0]}, $token;	# Put the new word into array
	push @newToks, $token;	# and record it as a new word
	${$_[1]} = $rest;	# Update the string to be matched
	$dict{$token} = 1;	# and add to dictionary
	$maxtoklen = length($token) unless ($maxtoklen > length($token));
};

Download

The above source can be downloaded here:

Maximal matching with new token identification: maxmatch3.pl
Maximal matching: maxmatch2.pl
Old version of the maximal matching program, the code is a bit messy: maxmatch.pl