|
|
Re: 请教高手:多列带有重复的ID合并成一列
1,猪头,楼主说的很清楚,任意两个(即一边),你只考虑一个(即一点)。所以得到的结果不符合楼主要求。
2,猪头,边的问题,因为涉及到的边并不是一条边,而是很多叠合在一起的边(很多条观察值共用一条边),所以边数并不是4倍行数,而是所有有两个相同点的边。
3,猪头,你的DFS写的不错,不过针对单点的,正好我上面给的代码得到只需要针对单点的DFS,所以我移花接木了一下,两个程序组合一个就ok,我小试了下10000条,得到了结果,应该是对的。
需要说明的是,下面的代码是通过转换两点为一点的问题,所以DFS只需要针对单点的DFS,编程难度稍微下降,问题能够解决,但是代价就是超长的CPU时间和IO时间。要想速度快,还得像oloolo大侠说的那样,凭借重型武器hash和合适的DFS算法,这样保证很短的时间算完,代码也很简洁很好看,不像下面的裹脚布。
猪头狮身组合的代码(未优化):
[code:2miq4v7r]data a1;
array id[1:4];
do _n_=1 to 700000;
do _i_ = 1 to dim(id);
id[_i_] = ceil(20000*ranuni(12345));
end;
call sortn(of id[*]); num1+1;
drop _i_; output;
end;
run;
data ex1;
array ids [4] id1-id4;
set a1;
do i=1 to 4;
id=ids[i];
if i=5 then num1+1;
num2=num1;
keep id num2;
output;
end;
run;
data ex2;
set a1 (rename=(id1=cid1 id2=cid2 id3=cid3 id4=cid4)) nobs=max;
call symput('max', max);
array cid [4] cid1-cid4;
/* put _n_;*/
do i=1 to max*4;
set ex1 point=i;
if mod(i,4)=1 then sum=0;
do j=1 to 4;
if num1 ne num2 then sum+(id=cid[j]);
end;
if mod(i,4)=0 then
do;
if sum ge 2 then
do;
call sortn(num1,num2);
keep num1 num2;
output;
end;
end;
end;
run;
proc sort data=ex2 out=ex2 nodupkey;
by num1 num2;
run;
data a2(rename=(num1=id1 num2=id2));
set ex2;
run;
data ids(keep=x id);
set a2 nobs=nobs;
if _n_=1 then call symputx('n',nobs);
* &n=the # of observations;
x = _n_;
array ids[*] id1-id2;
do _i_ = 1 to dim(ids);
id = ids[_i_];
output;
end;
drop _i_;
proc means data=ids(keep=id) nway noprint;
class id;
output out=distinctids(drop=_type_ _freq_);
data ctrl/view=ctrl;
retain fmtname 'id2seq' type 'i';
set distinctids(keep=id rename=(id=start)) nobs=nobs;
if _n_=1 then call symputx('m', nobs);
* &m=the # of distinct id's;
label = _n_+&n;
proc format cntlin=ctrl;
*construct edge set;
data edges;
set ids;
y = input(id, id2seq.);
output;
temp = x;
x=y;
y=temp;
output;
keep x y;
proc sort data=edges;
by x y;
*construct pointers to the incident edges @ each vertex;
data vertices;
do count=1 by 1 until(last.x);
set edges;
by x;
end;
lastobs+count;
firstobs=lastobs-count+1;
keep firstobs lastobs x;
*depth-first search;
data components(keep=x component);
array pointers[1:%eval(&n+&m),1:2] _temporary_;
array color[1:%eval(&n+&m)] _temporary_;
array stack[1:%eval(&n+&m)] _temporary_;
do _n_=1 by 1 until(eof);
set vertices end=eof;
pointers[_n_,1] = firstobs;
pointers[_n_,2] = lastobs;
end;
component = 0;
do _n_=lbound(color) to hbound(color);
if color[_n_] = . then do;
component = component+1;
stack_top = lbound(stack);
stack[stack_top] = _n_;
stack_size=1;
color[_n_]=component;
do until(stack_size=0);
top_vertex = stack[stack_top];
stack_top=stack_top-1;
stack_size=stack_size-1;
do _i_=pointers[top_vertex,1] to pointers[top_vertex,2];
set edges point=_i_;
if color[y] = . then do;
stack_top = stack_top+1;
stack[stack_top] = y;
stack_size=stack_size+1;
color[y]=component; *the line missed last time;
end; *end if color[y]=.;
end; *end of do _i_=... to ....;
end; *end of do until (stack_top=0);
end; *end of if color[_n_]=.;
end; *end of do _n_=1 to &nobs;
do x = 1 to &n;
component = color[x];
output;
end;
stop;
data a1_new;
merge a2 components(keep=component rename=(component=id));
run;
proc print;run;
data ex5;
set a1_new(keep=id1 id rename=(id1=num id=g)) a1_new(keep=id2 id rename=(id2=num id=g));
run;
proc sort nodupkey out=ex6;
by num;
run;
proc sql noprint;
select max(g) into :g from ex6;
quit;
data ex7;
do num= 1 to &max.;
output;
end;
run;
data ex8;
merge ex6 ex7;
by num;
if g =. then
do;
i+1;
id= &g.+i;
end;
else id=g;
drop i;
run;
proc print;
/*where g NE .; '.' 表示孤独的点 */
run;[/code:2miq4v7r] |
|