function [words,counts]=parseEmail(fname,dictionary)
% [words,counts]=parseEmail(fname,dictionary)
% Reads in file FNAME, collects unique words and counts their occurrences.
% A word is defined as any contiguous sequence of letters, _ and @
% (surrounded by anything that is not a letter, _ or @). All letters are
% converted to lower case before matching, so words are case-insensitive
% and are returned in lower case.
% If non-empty cell-array DICTIONARY is given, only words in it are
% considered (i.e., any word not in DICTIONARY is ignored). The comparison
% is against the lower-cased words, so DICTIONARY entries containing upper
% case letters never match.
% Returns:
%   WORDS{I}  is the I-th distinct word (sorted, as returned by
%             UNIQUE/INTERSECT);
%   COUNTS(I) is the number of times WORDS{I} appeared in this document
%             (1-by-N row vector).
% Note: words that do not appear in the file are not included in WORDS.
% Raises an error if FNAME cannot be opened.

if (nargin < 2)
    dictionary = [];
end

f = fopen(fname,'rb');
if (f < 0)
    error(['Can not open ' fname]);
end

% Read the whole file, then release the handle immediately so that an
% error in any later processing step cannot leak the open file.
raw = fread(f,'char')';
fclose(f);

% Convert to char, lower-case everything and enclose with spaces on both
% ends (keeps the text non-empty even for an empty file).
email = [32 lower(char(raw)) 32];

% Find all words (non unique, i.e. some words will appear multiple times).
% '+' requires at least one character, so empty matches are never produced.
matches = regexp(email,'[a-z_@]+','match');

% Collect unique words; ind(k) is the index into WORDS of matches{k}.
[words,ignore,ind] = unique(matches);

% Tally occurrences of every distinct word in a single pass.
if isempty(ind)
    occurrences = zeros(1,0);
else
    occurrences = accumarray(ind(:),1,[numel(words) 1]);
end

if (~isempty(dictionary))
    % Only keep words in the dictionary; keep indexes the unique word list.
    [words,keep] = intersect(words,dictionary);
else
    % Keep all of them.
    keep = 1:length(words);
end

% Select the tallies of the retained words, always as a row vector.
counts = reshape(occurrences(keep),1,[]);