Ahuige 能不能在提供点线索,我可以去找一找。 多谢了 :)作者: shiyiming 时间: 2010-7-21 21:52 标题: Re: 请教高手:多列带有重复的ID合并成一列 没有想到什么好的办法。如果你的数据不是很多,那你只能用每行每列的记录与下行每列的所有记录匹配,依次做下去。作者: shiyiming 时间: 2010-7-21 23:00 标题: Re: 请教高手:多列带有重复的ID合并成一列 to gzgoon
this can be solved using hash list while your obs are index by the four IDs SEPARATELY.
by OP's description, as searching goes on, any subsequent observation that has 2 IDs in the current combination of id pool will be identified as belonging to the same group, then any new ID will be added to the pool. The main hash table contains the record number which is the unqiue ID for each obs, and the other elements in the hash is an 5 auxilary hash lists, 4 for id pool of each ID# while the last auxilary hash list contains all related records
for example, for first record, I search by index on ID1 and find record #2, by examination, it satisfies the criteria, so all the IDs are added to the first auxilary hash list while _n_=2 is also added to the second auxilary hash list. Now our ID pool has a set of ID1={1,2}, ID2={11, 12}, ID3={21, 22}, ID4={31}. Iterate over these four auxilary hash tables to find next record that satisfies the critera.
Now on ID2=11, we find _n_=6, by examination, it contains id2 \in ID3={21,22}, so that this record is added to the the hash table in a way as decribed above.
Keep this iteration until all four ID# lists are exhausted and no new record can be found.作者: shiyiming 时间: 2010-7-21 23:18 标题: Re: 请教高手:多列带有重复的ID合并成一列 [code:313d781q]data have;
set have; _obs++1;
run;
%macro aMcr;
%global ns; %local i _i;
%do k = 1 %to 4;
data have12;
set have1;
array id_{500000} _temporary_; *max 500, 000records;
%if &k = 1 %then%do;
if _n_ = 1 then id_[1] = id1;
if id1 = id_[1] then output;
%end;
%else%do; %let i = 1;
%do%while(%scan(&ns, &i) ne); %let _i = %scan(&ns, &i);
if _obs = &_i then id_[&_i] = id&k;
if id&k = id_[&i] then output;
%let i = %eval(&i+1);
%end;
%end;
run;
proc append base = have0 data = have12 force nowarn; run;
proc sql noprint;
select distinct _obs into :ns separated by ' ' from have0;
quit;
%end;
%mend aMcr;
data have_final; length id _obs id1-id4 8.; stop; run;
%let s = 0;
%macro bMcr;
data _null_;
call symputx('obs', obs);
set have nobs = obs; stop;
run;
%let nobs = 0;
%do%while(&nobs LT &obs); %let s = %eval(&s+1);
data have0; length id _obs id1-id4 8.; stop; run;
proc sql;
create table have1 as select * from have where _obs notin (select distinct _obs from have_final);
quit;
%aMcr
proc sort data = have0 nodupkey; by _obs; run;
data have00; set have0;
id = &s;
run;
proc append base = have_final data = have00 force nowarn; run;
proc sql noprint;
select count(*) into :nobs from have_final;
quit;
%end;
%mend bMcr;
proc sql;
create table id_pair as
select id1 as old_id,id1 as new_id from raw
union
select id2 as old_id,id2 as new_id from raw
union
select id3 as old_id,id3 as new_id from raw
union
select id4 as old_id,id4 as new_id from raw;
quit;
data _null_;
if _n_=1 then do;
declare hash h(dataset:"id_pair",ordered:'yes',hashexp:16);
h.defineKey('old_id');
h.defineData('old_id','new_id');
h.defineDone();
call missing(old_id,new_id);
declare hiter iter('h');
end;
set raw end=eof;
array arr{*} id1-id4;
do i=1 to 3;
do j=i+1 to 4;
rc=h.find(key:arr(i));
temp=new_id;
do while(iter.next()=0);
if new_id=temp then rc=h.replace(key:old_id,data:old_id,data:arr(j));
end;
end;
end;
if eof then rc=h.output(dataset:'_temp');
run;
proc sql;
create table id_pair as
select a.old_id,b.group
from _temp a,
(select min(old_id) as group,old_id from _temp
group by new_id) b
where a.old_id=b.old_id;
drop table _temp;
quit;
data final(keep=group id1-id4);
if _n_ = 1 then do;
declare hash h(dataset:"id_pair",hashexp:16);
h.defineKey('old_id');
h.defineData('group');
h.defineDone();
call missing(old_id,group);
end;
set raw;
rc=h.find(key:id1);
run;[/code:1bj53880]作者: shiyiming 时间: 2010-7-22 10:11 标题: Re: 请教高手:多列带有重复的ID合并成一列 oloolo和hopewell的想法很好,还有所有的楼上楼下你们也都是我崇拜的偶像。作者: shiyiming 时间: 2010-7-22 14:53 标题: Re: 请教高手:多列带有重复的ID合并成一列 to 死猪头:能不能试着写一写code呢? <!-- s:) --><img src="{SMILIES_PATH}/icon_smile.gif" alt=":)" title="Smile" /><!-- s:) -->
实现方式各异, 效率很不同。作者: shiyiming 时间: 2010-7-23 11:37 标题: Re: 请教高手:多列带有重复的ID合并成一列 Here is my two cents. Haven't got the chance to test it on huge datasets though. <!-- s:D --><img src="{SMILIES_PATH}/icon_biggrin.gif" alt=":D" title="Very Happy" /><!-- s:D -->
proc sql;
create table matches as
select a.rowid as rowid_a,
a.id1 as id1_a,
a.id2 as id2_a,
a.id3 as id3_a,
a.id4 as id4_a,
b.rowid as rowid_b,
b.id1 as id1_b,
b.id2 as id2_b,
b.id3 as id3_b,
b.id4 as id4_b
from a1 as a,
a1 as b
where a.rowid ^= b.rowid and
(a.id1 = b.id1 or
a.id2 = b.id2 or
a.id3 = b.id3 or
a.id4 = b.id4
);
quit;
proc sql;
create table matches as
select distinct
1 as id,
min(rowid_a, rowid_b) as rowid1,
max(rowid_a, rowid_b) as rowid2
from matches;
create table matches as
select id,
rowid1,
rowid2
from matches
union all
select 1 as id,
a.rowid as rowid1,
a.rowid as rowid2
from a1 as a
left join
matches as b
on a.rowid = b.rowid1 or
a.rowid = b.rowid2
where b.rowid1 is null;
quit;
data results(keep = rowsetid rowset);
set matches;
by id;
retain rowsetid rowset newset;
length rowset $256; /* Adjust this length if necessary. */
if first.id then do;
rowsetid = 1;
newset = 1;
end;
data tem.a1;
array id[1:4];
do _n_=1 to 700000;
do _i_ = 1 to dim(id);
id[_i_] = ceil(20000*ranuni(12345));
end;
call sortn(of id[*]); num1+1;
drop _i_; output;
end;
run;
data tem.ex1;
array ids [4] id1-id4;
set tem.a1;
do i=1 to 4;
id=ids[i];
if i=5 then num1+1;
num2=num1;
keep id num2;
output;
end;
run;
data tem.ex2;
set tem.a1 (rename=(id1=cid1 id2=cid2 id3=cid3 id4=cid4)) nobs=max;
array cid [4] cid1-cid4;
put _n_;
do i=1 to max*4;
set tem.ex1 point=i;
if mod(i,4)=1 then sum=0;
do j=1 to 4;
if num1 ne num2 then sum+(id=cid[j]);
end;
if mod(i,4)=0 then
do;
if sum ge 2 then
do;
call sortn(num1,num2);
keep num1 num2;
output;
end;
end;
end;
还请继续讨论,让我学习学习。作者: shiyiming 时间: 2010-7-24 23:55 标题: Re: 请教高手:多列带有重复的ID合并成一列 工作中的实际问题。课后练习题的话,也不可能七十多万个record :)
看来得找那些专业研究算法的高手了 :)
(非常感谢楼上大侠们的idea,虽然还没解决问题,但眼界开阔不少 :))作者: shiyiming 时间: 2010-7-25 07:28 标题: Re: 请教高手:多列带有重复的ID合并成一列 to 死猪头
right, at first look, it looks like a typical transverse problem in a graph, however, it is more sophisticated, but the same principle applies. You still can do a DFS-type search which is not hard to code in SAS.
I just found this is an open issue in OP's work, and unless he pays me....作者: shiyiming 时间: 2010-7-25 13:32 标题: Re: 请教高手:多列带有重复的ID合并成一列 猪头又上当了!
另外浏览了oloolo大师的伯克,大开眼界!尽然有Tensor,是张量代数里那个东西吗?竟然有应用?望oloolo大师不吝赐教解惑众生!作者: shiyiming 时间: 2010-7-25 14:13 标题: Re: 请教高手:多列带有重复的ID合并成一列 to 死猪头
I made my points crystal clear in my first response, but no gurantee that it is the correct one. I think you know Tensor as well, pls ZKSS. It could also be a good topic for SuperK as Tensor is used for image recognition. SuperK, it is your turn now.
so you are in US? W Va? What do u do right now? Student or faculty or working professional? I am working for an US agency.....作者: shiyiming 时间: 2010-7-25 14:27 标题: Re: 请教高手:多列带有重复的ID合并成一列 你们大师对话吧,我把代码搞明白先 <!-- s:lol: --><img src="{SMILIES_PATH}/icon_lol.gif" alt=":lol:" title="Laughing" /><!-- s:lol: -->作者: shiyiming 时间: 2010-7-25 21:53 标题: Re: 请教高手:多列带有重复的ID合并成一列 1,猪头,楼主说的很清楚,任意两个(即一边),你只考虑一个(即一点)。所以得到的结果不符合楼主要求。
2,猪头,边的问题,因为涉及到的边并不是一条边,而是很多叠合在一起的边(很多条观察值共用一条边),所以边数并不是4倍行数,而是所有有两个相同点的边。
3,猪头,你的DFS写的不错,不过针对单点的,正好我上面给的代码得到只需要针对单点的DFS,所以我移花接木了一下,两个程序组合一个就ok,我小试了下10000条,得到了结果,应该是对的。
猪头狮身组合的代码(未优化):
[code:2miq4v7r]data a1;
array id[1:4];
do _n_=1 to 700000;
do _i_ = 1 to dim(id);
id[_i_] = ceil(20000*ranuni(12345));
end;
call sortn(of id[*]); num1+1;
drop _i_; output;
end;
run;
data ex1;
array ids [4] id1-id4;
set a1;
do i=1 to 4;
id=ids[i];
if i=5 then num1+1;
num2=num1;
keep id num2;
output;
end;
run;
data ex2;
set a1 (rename=(id1=cid1 id2=cid2 id3=cid3 id4=cid4)) nobs=max;
call symput('max', max);
array cid [4] cid1-cid4;
/* put _n_;*/
do i=1 to max*4;
set ex1 point=i;
if mod(i,4)=1 then sum=0;
do j=1 to 4;
if num1 ne num2 then sum+(id=cid[j]);
end;
if mod(i,4)=0 then
do;
if sum ge 2 then
do;
call sortn(num1,num2);
keep num1 num2;
output;
end;
end;
end;
run;
proc sort data=ex2 out=ex2 nodupkey;
by num1 num2;
run;
data a2(rename=(num1=id1 num2=id2));
set ex2;
run;
data ids(keep=x id);
set a2 nobs=nobs;
if _n_=1 then call symputx('n',nobs);
* &n=the # of observations;
x = _n_;
array ids[*] id1-id2;
do _i_ = 1 to dim(ids);
id = ids[_i_];
output;
end;
drop _i_;
proc means data=ids(keep=id) nway noprint;
class id;
output out=distinctids(drop=_type_ _freq_);
data ctrl/view=ctrl;
retain fmtname 'id2seq' type 'i';
set distinctids(keep=id rename=(id=start)) nobs=nobs;
if _n_=1 then call symputx('m', nobs);
* &m=the # of distinct id's;
label = _n_+&n;
proc format cntlin=ctrl;
*construct edge set;
data edges;
set ids;
y = input(id, id2seq.);
output;
temp = x;
x=y;
y=temp;
output;
keep x y;
proc sort data=edges;
by x y;
*construct pointers to the incident edges @ each vertex;
data vertices;
do count=1 by 1 until(last.x);
set edges;
by x;
end;
lastobs+count;
firstobs=lastobs-count+1;
keep firstobs lastobs x;
*depth-first search;
data components(keep=x component);
array pointers[1:%eval(&n+&m),1:2] _temporary_;
array color[1:%eval(&n+&m)] _temporary_;
array stack[1:%eval(&n+&m)] _temporary_;
do _n_=1 by 1 until(eof);
set vertices end=eof;
pointers[_n_,1] = firstobs;
pointers[_n_,2] = lastobs;
end;
component = 0;
do _n_=lbound(color) to hbound(color);
if color[_n_] = . then do;
component = component+1;
stack_top = lbound(stack);
stack[stack_top] = _n_;
stack_size=1;
color[_n_]=component;
do until(stack_size=0);
top_vertex = stack[stack_top];
stack_top=stack_top-1;
stack_size=stack_size-1;
do _i_=pointers[top_vertex,1] to pointers[top_vertex,2];
set edges point=_i_;
if color[y] = . then do;
stack_top = stack_top+1;
stack[stack_top] = y;
stack_size=stack_size+1;
color[y]=component; *the line missed last time;
end; *end if color[y]=.;
end; *end of do _i_=... to ....;
end; *end of do until (stack_top=0);
end; *end of if color[_n_]=.;
end; *end of do _n_=1 to &nobs;
do x = 1 to &n;
component = color[x];
output;
end;
stop;
data a1_new;
merge a2 components(keep=component rename=(component=id));
run;
proc print;run;
data ex5;
set a1_new(keep=id1 id rename=(id1=num id=g)) a1_new(keep=id2 id rename=(id2=num id=g));
run;
proc sort nodupkey out=ex6;
by num;
run;
proc sql noprint;
select max(g) into :g from ex6;
quit;
data ex7;
do num= 1 to &max.;
output;
end;
run;
data ex8;
merge ex6 ex7;
by num;
if g =. then
do;
i+1;
id= &g.+i;
end;
else id=g;
drop i;
run;
proc print;
/*where g NE .; '.' 表示孤独的点 */
run;[/code:2miq4v7r]作者: shiyiming 时间: 2010-7-25 22:30 标题: Re: 请教高手:多列带有重复的ID合并成一列 有CPU快点的,可以试试,我用自己构造的需要回溯的数据,结果是没有问题,简单的代码待我学会DFS再来。