usr 发表于 2020-10-23 21:46:02

【C】语言写的基于Trie的字典统计单词出现次数

本帖最后由 new_starter 于 2020-11-2 23:11 编辑

受到:【C】语言编写的字典支持各种类型扩展 这篇帖子的启发,写了一个C语言基于Trie的字典。
这个工程使用了StoneValley库。
StoneValley库在使用的时候将库文件全部添加到工程即可。
以后还会写一些基于StoneValley库的代码供大家参考。
以下是主函数文件的代码:

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include "svtree.h"
#include "svstring.h"

typedef struct st_VOCABULARY
{
        long freq; /* 单词频率 */
        char * pword; /* malloc申请的单词字符串 */
} VOCABULARY, * P_VOCABULARY;

P_VOCABULARY CreateWord()
{
        P_VOCABULARY pvol = malloc(sizeof(VOCABULARY));
        if (NULL != pvol)
        {
                pvol->freq = 1;
                pvol->pword = NULL;
        }
        return pvol;
}

void DeleteWord(P_VOCABULARY pvol)
{
        free(pvol->pword);
        free(pvol);
}

void DestoryArray(P_ARRAY_Z parrz)
{
        size_t i = 0;
        for (; i < parrz->num; ++i)
        {
                P_VOCABULARY pvol = *(((P_VOCABULARY *)parrz->pdata) + i);
                DeleteWord(pvol);
        }
        strFreeArrayZ(parrz);
}

int CbfPrintResult(void * pitem, size_t param)
{
        printf("%s %lu\n", (*((P_VOCABULARY *)pitem))->pword, (*((P_VOCABULARY *)pitem))->freq);
        DWC4100(param);
        return CBF_CONTINUE;
}

int cbfcmpchar(const void * px, const void * py)
{
        return *(char *)px - *(char *)py;
}

int cbfcmpfreq(const void * px, const void * py)
{
        return (*(P_VOCABULARY *)py)->freq - (*(P_VOCABULARY *)px)->freq;
}

int main(int argc, char * argv[])
{
        int c;
        long i, j = 0, k = 0, sizeBuffer = 1024, sizeArray = 1024;
        BOOL bCase = FALSE, bSort = FALSE;
        char * buffer = NULL;
        P_TRIE_A ptrie;
        ARRAY_Z arrz;

        if (NULL == (buffer = (char *) malloc(sizeBuffer)))
                return 1;
        *buffer = '\0';
        if (NULL == (ptrie = treCreateTrieA()))
                return 2;
        if (NULL == strInitArrayZ(&arrz, sizeArray, sizeof(P_VOCABULARY)))
                return 3;

        for (i = 1; i < argc; ++i)
        {
                if (0 == strcmp("-s", argv)) /* 排序开关 */
                        bSort = TRUE;
                else if (0 == strcmp("-c", argv)) /* 大小写开关 */
                        bCase = TRUE;
                else
                        return 4;
        }
        while (EOF != (c = fgetc(stdin)))
        {
                if (j >= sizeBuffer)
                {
                        char * tempBuffer;
                        sizeBuffer += 1024;
                        if (NULL == (tempBuffer = realloc(buffer, sizeBuffer)))
                                return 5;
                        buffer = tempBuffer;
                }
                if (c >= 'A' && c <= 'Z')
                {
                        *(buffer + j++) = bCase ? (char)c : (char)tolower(c);
                }
                else if (c >= 'a' && c <= 'z')
                {
                        *(buffer + j++) = (char)c;
                }
                else
                {
                        *(buffer + j) = '\0';
                        if (0 != j)
                        {
                                size_t * presult;
                                P_VOCABULARY pvol;
                                size_t bufferLen = strlen(buffer);
                                /* 在Trie里查询buffer所包含的单词 */
                                if (NULL == (presult = treSearchTrieA(ptrie, buffer, bufferLen, sizeof(char), cbfcmpchar)))
                                {
                                        if (k >= sizeArray)
                                        {
                                                sizeArray += 1024;
                                                strResizeArrayZ(&arrz, sizeArray, sizeof(P_VOCABULARY));
                                        }
                                        /* 如果没找到就创建一个新单词 */
                                        pvol = CreateWord();
                                        /* 将新单词插入Trie */
                                        treInsertTrieA(ptrie, buffer, bufferLen, sizeof(char), (size_t)pvol, cbfcmpchar);
                                        if (NULL == (pvol->pword = malloc(bufferLen + 1)))
                                                return 6;
                                        memcpy(pvol->pword, buffer, bufferLen + 1);
                                        /* 将新单词的指针存储在数组里 */
                                        *(((P_VOCABULARY *)arrz.pdata) + k++) = pvol;
                                }
                                else
                                {        /* 如果找到单词,就把Trie里存储的Appendix拿出来转换为P_VOCABULARY指针,并且把频率+1 */
                                        pvol = (P_VOCABULARY)*presult;
                                        ++pvol->freq;
                                }
                        }
                        j = 0;

                }
        }
        strResizeArrayZ(&arrz, k, sizeof(P_VOCABULARY));
        if (bSort)
                strSortArrayZ(&arrz, sizeof(P_VOCABULARY), cbfcmpfreq);
        strTraverseArrayZ(&arrz, sizeof(P_VOCABULARY), CbfPrintResult, 0, FALSE);
        DestoryArray(&arrz);
        treDeleteTrieA(ptrie, sizeof(char));
        free(buffer);
        return 0;
}

使用的时候(在Linux环境下)将工程里所有的文件编译为一个名为freq的可执行体:
$ cc *.c -o freq
将可执行体移动到Debug文件夹下就可以对其测试:
$ mv freq ../Debug
以下是一个供测试的小短文:
A wildfire burning in Grand County, Colorado, has exploded from 19,000 acres to more than 125,000 on Thursday, forcing thousands of families to evacuate.
The so-called East Troublesome Fire raced through the town of Grand Lake and into the western portion of Rocky Mountain National Park, which closed to visitors Thursday, CBS Denver reported.
Thick smoke and flames were closing in on homes and terrifying residents. Security video even captured the scene as flames overtook a home. It was not immediately clear how much damage the fire caused in the Grand Lake community.
"We know that historic buildings and businesses are on people's minds, and we just don't have confirmed information at this time. Many of the buildings and the establishments, they are the heart and soul of our community. And as soon we know something definitive, we'll share with those who are affected and the community," Mayor Steve Kudron said late Thursday morning, according to the TV station.
The flames are being driven by high winds and dead, dry timber.
"We plan for the worst. This is the worst of the worst of the worst," said Grand County Sheriff Brett Schroetlin while referencing the fire's 100,000-acre jump in mere hours.
Nearly 300 firefighters are battling the flames in dense wooded hills as the fire continues to spread into the national park. This wildfire could end up merging with the Cameron Peak Fire, which was reported to be burning just a few miles away, according to CBS Denver.
因为程序是用stdin作为输入,所以使用管道连接程序十分方便:
输入以下代码按单词出现顺序统计单词出现频率(区分大小写):
$ cat new2.txt | ./freq -c
A 1
wildfire 2
burning 2
in 5
Grand 4
County 2
Colorado 1
has 1
exploded 1
from 1
acres 1
to 7
more 1
than 1
on 3
Thursday 3
forcing 1
thousands 1
of 7
families 1
evacuate 1
The 2
so 1
called 1
East 1
Troublesome 1
Fire 2
raced 1
through 1
the 19
town 1
Lake 2
and 9
into 2
western 1
portion 1
Rocky 1
Mountain 1
National 1
Park 1
which 2
closed 1
visitors 1
CBS 2
Denver 2
reported 2
Thick 1
smoke 1
flames 4
were 1
closing 1
homes 1
terrifying 1
residents 1
Security 1
video 1
even 1
captured 1
scene 1
as 3
overtook 1
a 2
home 1
It 1
was 2
not 1
immediately 1
clear 1
how 1
much 1
damage 1
fire 3
caused 1
community 3
We 2
know 2
that 1
historic 1
buildings 2
businesses 1
are 5
people 1
s 2
minds 1
we 3
just 2
don 1
t 1
have 1
confirmed 1
information 1
at 1
this 1
time 1
Many 1
establishments 1
they 1
heart 1
soul 1
our 1
And 1
soon 1
something 1
definitive 1
ll 1
share 1
with 2
those 1
who 1
affected 1
Mayor 1
Steve 1
Kudron 1
said 2
late 1
morning 1
according 2
TV 1
station 1
being 1
driven 1
by 1
high 1
winds 1
dead 1
dry 1
timber 1
plan 1
for 1
worst 4
This 2
is 1
Sheriff 1
Brett 1
Schroetlin 1
while 1
referencing 1
acre 1
jump 1
mere 1
hours 1
Nearly 1
firefighters 1
battling 1
dense 1
wooded 1
hills 1
continues 1
spread 1
national 1
park 1
could 1
end 1
up 1
merging 1
Cameron 1
Peak 1
be 1
few 1
miles 1
away 1
输入以下代码可以按频率由高到低的顺序显示单词和出现次数:
$ cat new2.txt | ./freq -s -c
以下代码可以显示包含特定字符串“the”的单词及出现次数(不区分大小写):
$ cat new2.txt | ./freq | grep "the"

这里放出打包好的工程源码:


页: [1]
查看完整版本: 【C】语言写的基于Trie的字典统计单词出现次数