Rev 10 | Rev 8171 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 10 | Rev 7676 | ||
---|---|---|---|
Line 13... | Line 13... | ||
13 | * You should have received a copy of the GNU General Public License |
13 | * You should have received a copy of the GNU General Public License |
14 | * along with this program; if not, write to the Free Software |
14 | * along with this program; if not, write to the Free Software |
15 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
15 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
16 | */ |
16 | */ |
17 | 17 | ||
18 |
|
18 | /* This program computes an optimal coding of variable lengths |
19 |
|
19 | * on a given distribution of probabilities, |
20 |
|
20 | * using Huffman algorithm */ |
21 | 21 | ||
22 | /* Input data: via two environment variables. |
22 | /* Input data: via two environment variables. |
23 | * wims_exec_parm is a comma-separated list of probability distributions. |
23 | * wims_exec_parm is a comma-separated list of probability distributions. |
24 | * |
24 | * Limited to MAX_ITEMS. |
25 | * |
25 | * The input data will be scaled to unit sum. |
26 | * w_huffman_radix is the encoding radix, between 2 and MAX_RADIX. |
26 | * w_huffman_radix is the encoding radix, between 2 and MAX_RADIX. |
27 | * |
27 | * |
28 | * Output: two lines. |
28 | * Output: two lines. |
29 | * Line 1: Entropy and Average code length, comma-separated. |
29 | * Line 1: Entropy and Average code length, comma-separated. |
30 | * Line 2: comma-separated list of codes. |
30 | * Line 2: comma-separated list of codes. |
31 | */ |
31 | */ |
32 | 32 | ||
33 | #define MAX_ITEMS |
33 | #define MAX_ITEMS 2048 |
34 | #define MAX_CODELEN |
34 | #define MAX_CODELEN 100 |
35 | 35 | ||
36 | const char *codechar="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; |
36 | const char *codechar="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; |
37 | #define MAX_RADIX |
37 | #define MAX_RADIX strlen(codechar) |
38 | 38 | ||
39 | #include "../Lib/libwims.h" |
39 | #include "../Lib/libwims.h" |
40 | 40 | ||
41 | struct { |
41 | struct { |
42 | double prob; |
42 | double prob; |
Line 60... | Line 60... | ||
60 | 60 | ||
61 | int indcmp(const void *p1, const void *p2) |
61 | int indcmp(const void *p1, const void *p2) |
62 | { |
62 | { |
63 | const int *i1, *i2; |
63 | const int *i1, *i2; |
64 | double d1, d2; |
64 | double d1, d2; |
65 | 65 | ||
66 | i1=p1; i2=p2; |
66 | i1=p1; i2=p2; |
67 | d1=maintab[*i1].prob; d2=maintab[*i2].prob; |
67 | d1=maintab[*i1].prob; d2=maintab[*i2].prob; |
68 | if(d1==d2) return 0; |
68 | if(d1==d2) return 0; |
69 | if(d1>d2) return -1; |
69 | if(d1>d2) return -1; |
70 | else return 1; |
70 | else return 1; |
71 | } |
71 | } |
72 | 72 | ||
73 | void huffman(void) |
73 | void huffman(void) |
74 | { |
74 | { |
75 | int t, i, j, l; |
75 | int t, i, j, l; |
76 | double d; |
76 | double d; |
77 | 77 | ||
78 | while(indcnt>radix) { |
78 | while(indcnt>radix) { |
79 |
|
79 | qsort(indtab,indcnt,sizeof(indtab[0]),indcmp); |
80 |
|
80 | if(radix>2) t=(indcnt+radix-3)%(radix-1)+2; |
81 |
|
81 | else t=2; |
82 |
|
82 | d=0; |
83 |
|
83 | for(i=indcnt-t; i<indcnt; i++) { |
84 |
|
84 | d+=maintab[indtab[i]].prob; |
85 |
|
85 | maintab[indtab[i]].ind=itemcnt; |
86 |
|
86 | } |
87 |
|
87 | maintab[itemcnt].prob=d; |
88 |
|
88 | maintab[itemcnt].ind=-1; |
89 |
|
89 | maintab[itemcnt].codelen=-1; |
90 |
|
90 | indtab[indcnt-t]=itemcnt; |
91 |
|
91 | indcnt-=t-1; itemcnt++; |
92 | } |
92 | } |
93 | for(i=0;i<indcnt;i++) { |
93 | for(i=0;i<indcnt;i++) { |
94 |
|
94 | maintab[indtab[i]].codelen=1; |
95 |
|
95 | maintab[indtab[i]].code[0]=i; |
96 |
|
96 | maintab[indtab[i]].ind=0; |
97 | } |
97 | } |
98 | for(i=itemcnt-1;i>=0;i--) { |
98 | for(i=itemcnt-1;i>=0;i--) { |
99 |
|
99 | if(maintab[i].codelen>0) continue; |
100 |
|
100 | j=maintab[i].ind; l=maintab[j].codelen; |
101 |
|
101 | if(l>=MAX_CODELEN) error("Code too long."); |
102 |
|
102 | memmove(maintab[i].code,maintab[j].code,l); |
103 |
|
103 | maintab[i].code[l]=maintab[j].ind++; |
104 |
|
104 | maintab[i].codelen=l+1; |
105 |
|
105 | maintab[i].ind=0; |
106 | } |
106 | } |
107 | } |
107 | } |
108 | 108 | ||
109 | void output(void) |
109 | void output(void) |
110 | { |
110 | { |
111 | int i, j; |
111 | int i, j; |
Line 113... | Line 113... | ||
113 | 113 | ||
114 | d=0; |
114 | d=0; |
115 | for(i=0;i<origcnt;i++) d+=maintab[i].prob*maintab[i].codelen; |
115 | for(i=0;i<origcnt;i++) d+=maintab[i].prob*maintab[i].codelen; |
116 | printf("%f,%f\n",entropy,d); |
116 | printf("%f,%f\n",entropy,d); |
117 | for(i=0;i<origcnt;i++) { |
117 | for(i=0;i<origcnt;i++) { |
118 |
|
118 | for(j=0;j<maintab[i].codelen;j++) printf("%c",codechar[(int) maintab[i].code[j]]); |
119 |
|
119 | if(i<origcnt-1) printf(","); |
120 |
|
120 | else printf("\n"); |
121 | } |
121 | } |
122 | } |
122 | } |
123 | 123 | ||
124 | void getparm(char *p) |
124 | void getparm(char *p) |
125 | { |
125 | { |
Line 127... | Line 127... | ||
127 | char *p1, *p2; |
127 | char *p1, *p2; |
128 | double d1, dt; |
128 | double d1, dt; |
129 | 129 | ||
130 | origcnt=0; dt=0; |
130 | origcnt=0; dt=0; |
131 | for(p1=find_word_start(p); |
131 | for(p1=find_word_start(p); |
132 |
|
132 | *p1; p1=find_word_start(p2)) { |
133 |
|
133 | for(p2=p1; *p2 && strchr(",;",*p2)==NULL; p2++); |
134 |
|
134 | if(*p2) *p2++=0; |
135 |
|
135 | d1=strevalue(p1); |
136 |
|
136 | if(!finite(d1) || d1<0) { |
137 |
|
137 | char buf[256]; |
138 |
|
138 | snprintf(buf,sizeof(buf),"Bad data: %s",p1); |
139 |
|
139 | error(buf); |
140 |
|
140 | } |
141 |
|
141 | maintab[origcnt++].prob=d1; |
142 |
|
142 | dt+=d1; |
143 | } |
143 | } |
144 | if(dt*1000000<1) error("Empty data sum."); |
144 | if(dt*1000000<1) error("Empty data sum."); |
145 | if(origcnt<2) error("Insufficient data for encoding."); |
145 | if(origcnt<2) error("Insufficient data for encoding."); |
146 | itemcnt=origcnt; |
146 | itemcnt=origcnt; |
147 | if(dt!=1) for(i=0; i<origcnt; i++) maintab[i].prob/=dt; |
147 | if(dt!=1) for(i=0; i<origcnt; i++) maintab[i].prob/=dt; |
148 | 148 | ||
149 | entropy=0; |
149 | entropy=0; |
150 | for(i=0;i<origcnt;i++) { |
150 | for(i=0;i<origcnt;i++) { |
151 |
|
151 | maintab[i].codelen=-1; maintab[i].ind=-1; |
152 |
|
152 | indtab[i]=i; |
153 |
|
153 | d1=maintab[i].prob; |
154 |
|
154 | if(d1>0) entropy-=d1*log(d1); |
155 | } |
155 | } |
156 | entropy=entropy/log(radix); |
156 | entropy=entropy/log(radix); |
157 | indcnt=origcnt; |
157 | indcnt=origcnt; |
158 | } |
158 | } |
159 | 159 | ||
160 | int main() |
160 | int main() |
161 | { |
161 | { |
162 | char *p; |
162 | char *p; |
163 | int r; |
163 | int r; |
164 | 164 | ||
165 | error1=error; error2=error; error3=error; |
165 | error1=error; error2=error; error3=error; |
166 | p=getenv("w_huffman_radix"); |
166 | p=getenv("w_huffman_radix"); |
167 | if(p==NULL || *p==0) p=getenv("huffman_radix"); |
167 | if(p==NULL || *p==0) p=getenv("huffman_radix"); |
168 | if(p==NULL || *p==0) radix=2; |
168 | if(p==NULL || *p==0) radix=2; |
169 | else { |
169 | else { |
170 |
|
170 | r=atoi(p); if(r!=0) radix=r; |
171 | } |
171 | } |
172 | if(radix<2 || radix>MAX_RADIX) error("Bad radix."); |
172 | if(radix<2 || radix>MAX_RADIX) error("Bad radix."); |
173 | 173 | ||
174 | p=getenv("wims_exec_parm"); |
174 | p=getenv("wims_exec_parm"); |
175 | if(p==NULL || *p==0) error("No input data."); |
175 | if(p==NULL || *p==0) error("No input data."); |
176 | getparm(p); |
176 | getparm(p); |
177 | 177 | ||
178 | huffman(); |
178 | huffman(); |
179 | output(); |
179 | output(); |
180 | 180 | ||
181 | return 0; |
181 | return 0; |
182 | } |
182 | } |
183 | 183 |